blob: 6b6ccbefb7557f28131f6fbd223e2b1d03c994df [file] [log] [blame]
Guido van Rossumdb25f321997-07-10 14:31:32 +00001/*
2 * -*- mode: c-mode; c-file-style: python -*-
3 */
4
Guido van Rossum004c1e11997-05-09 02:35:58 +00005/* regexpr.c
6 *
7 * Author: Tatu Ylonen <ylo@ngs.fi>
8 *
9 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
10 *
11 * Permission to use, copy, modify, distribute, and sell this software
12 * and its documentation for any purpose is hereby granted without
13 * fee, provided that the above copyright notice appear in all copies.
14 * This software is provided "as is" without express or implied
15 * warranty.
16 *
17 * Created: Thu Sep 26 17:14:05 1991 ylo
18 * Last modified: Mon Nov 4 17:06:48 1991 ylo
19 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
20 *
21 * This code draws many ideas from the regular expression packages by
22 * Henry Spencer of the University of Toronto and Richard Stallman of
23 * the Free Software Foundation.
24 *
25 * Emacs-specific code and syntax table code is almost directly borrowed
26 * from GNU regexp.
27 *
28 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
29 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
30 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
31 * probably one or two others that I'm forgetting.
32 *
33 * $Id$ */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000034
Guido van Rossum339cfa31996-08-08 19:12:37 +000035#include "config.h" /* For Win* specific redefinition of printf c.s. */
Guido van Rossum339cfa31996-08-08 19:12:37 +000036
Guido van Rossum004c1e11997-05-09 02:35:58 +000037#include "myproto.h" /* For PROTO macro --Guido */
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000038
Guido van Rossumb674c3b1992-01-19 16:32:47 +000039#include <stdio.h>
Guido van Rossum004c1e11997-05-09 02:35:58 +000040
41#ifndef NDEBUG
42#define NDEBUG 1
43#endif
44
Guido van Rossumb674c3b1992-01-19 16:32:47 +000045#include <assert.h>
46#include "regexpr.h"
47
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000048#ifdef THINK_C
49/* Think C on the Mac really needs these headers... --Guido */
50#include <stdlib.h>
51#include <string.h>
52#else
Guido van Rossum88661e81996-05-23 22:55:58 +000053#if defined(__STDC__) || defined(_MSC_VER)
Guido van Rossum9abc5391992-03-27 17:24:37 +000054/* Don't mess around, use the standard headers */
55#include <stdlib.h>
56#include <string.h>
57#else
Guido van Rossumb674c3b1992-01-19 16:32:47 +000058char *malloc();
59void free();
60char *realloc();
Guido van Rossum9abc5391992-03-27 17:24:37 +000061#endif /* __STDC__ */
62#endif /* THINK_C */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000063
Guido van Rossumdb25f321997-07-10 14:31:32 +000064/* The original code blithely assumed that sizeof(short) == 2. Not
65 * always true. Original instances of "(short)x" were replaced by
66 * SHORT(x), where SHORT is #defined below. */
67
68#define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
69
Guido van Rossum004c1e11997-05-09 02:35:58 +000070/* The stack implementation is taken from an idea by Andrew Kuchling.
71 * It's a doubly linked list of arrays. The advantages of this over a
72 * simple linked list are that the number of mallocs required are
73 * reduced. It also makes it possible to statically allocate enough
74 * space so that small patterns don't ever need to call malloc.
75 *
76 * The advantages over a single array is that is periodically
77 * realloced when more space is needed is that we avoid ever copying
78 * the stack. */
79
80/* item_t is the basic stack element. Defined as a union of
81 * structures so that both registers, failure points, and counters can
82 * be pushed/popped from the stack. There's nothing built into the
83 * item to keep track of whether a certain stack item is a register, a
84 * failure point, or a counter. */
85
86typedef union item_t
87{
Guido van Rossumdb25f321997-07-10 14:31:32 +000088 struct
89 {
90 int num;
91 int level;
92 char *start;
93 char *end;
94 } reg;
95 struct
96 {
97 int count;
98 int level;
99 int phantom;
100 char *code;
101 char *text;
102 } fail;
103 struct
104 {
105 int num;
106 int level;
107 int count;
108 } cntr;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000109} item_t;
110
111#define STACK_PAGE_SIZE 256
112#define NUM_REGISTERS 256
113
114/* A 'page' of stack items. */
115
116typedef struct item_page_t
117{
Guido van Rossumdb25f321997-07-10 14:31:32 +0000118 item_t items[STACK_PAGE_SIZE];
119 struct item_page_t *prev;
120 struct item_page_t *next;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000121} item_page_t;
122
123
124typedef struct match_state
125{
Guido van Rossumdb25f321997-07-10 14:31:32 +0000126 /* The number of registers that have been pushed onto the stack
127 * since the last failure point. */
Guido van Rossum004c1e11997-05-09 02:35:58 +0000128
Guido van Rossumdb25f321997-07-10 14:31:32 +0000129 int count;
130
131 /* Used to control when registers need to be pushed onto the
132 * stack. */
133
134 int level;
135
136 /* The number of failure points on the stack. */
137
138 int point;
139
140 /* Storage for the registers. Each register consists of two
141 * pointers to characters. So register N is represented as
142 * start[N] and end[N]. The pointers must be converted to
143 * offsets from the beginning of the string before returning the
144 * registers to the calling program. */
145
146 char *start[NUM_REGISTERS];
147 char *end[NUM_REGISTERS];
148
149 /* Keeps track of whether a register has changed recently. */
150
151 int changed[NUM_REGISTERS];
152
153 /* Structure to encapsulate the stack. */
154 struct
155 {
156 /* index into the curent page. If index == 0 and you need
157 * to pop an item, move to the previous page and set index
158 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
159 * push a page. If index == STACK_PAGE_SIZE and you need
160 * to push a page move to the next page and set index =
161 * 0. If there is no new next page, allocate a new page
162 * and link it in. Otherwise, increment index to push a
163 * page. */
164
165 int index;
166 item_page_t *current; /* Pointer to the current page. */
167 item_page_t first; /* First page is statically allocated. */
168 } stack;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000169} match_state;
170
Guido van Rossumdb25f321997-07-10 14:31:32 +0000171/* Initialize a state object */
172
173/* #define NEW_STATE(state) \ */
174/* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
175/* state.stack.current = &state.stack.first; \ */
176/* state.stack.first.prev = NULL; \ */
177/* state.stack.first.next = NULL; \ */
178/* state.stack.index = 0; \ */
179/* state.level = 1 */
180
181#define NEW_STATE(state, nregs) \
182{ \
183 int i; \
184 for (i = 0; i < nregs; i++) \
185 { \
186 state.start[i] = NULL; \
187 state.end[i] = NULL; \
188 state.changed[i] = 0; \
189 } \
190 state.stack.current = &state.stack.first; \
191 state.stack.first.prev = NULL; \
192 state.stack.first.next = NULL; \
193 state.stack.index = 0; \
194 state.level = 1; \
195 state.count = 0; \
196 state.level = 0; \
197 state.point = 0; \
198}
199
200/* Free any memory that might have been malloc'd */
201
202#define FREE_STATE(state) \
203while(state.stack.first.next != NULL) \
204{ \
205 state.stack.current = state.stack.first.next; \
206 state.stack.first.next = state.stack.current->next; \
207 free(state.stack.current); \
208}
209
Guido van Rossum004c1e11997-05-09 02:35:58 +0000210/* Discard the top 'count' stack items. */
211
212#define STACK_DISCARD(stack, count, on_error) \
213stack.index -= count; \
214while (stack.index < 0) \
215{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000216 if (stack.current->prev == NULL) \
217 on_error; \
218 stack.current = stack.current->prev; \
219 stack.index += STACK_PAGE_SIZE; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000220}
221
222/* Store a pointer to the previous item on the stack. Used to pop an
223 * item off of the stack. */
224
225#define STACK_PREV(stack, top, on_error) \
226if (stack.index == 0) \
227{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000228 if (stack.current->prev == NULL) \
229 on_error; \
230 stack.current = stack.current->prev; \
231 stack.index = STACK_PAGE_SIZE - 1; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000232} \
233else \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000234{ \
235 stack.index--; \
236} \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000237top = &(stack.current->items[stack.index])
238
239/* Store a pointer to the next item on the stack. Used to push an item
240 * on to the stack. */
241
242#define STACK_NEXT(stack, top, on_error) \
243if (stack.index == STACK_PAGE_SIZE) \
244{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000245 if (stack.current->next == NULL) \
246 { \
247 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
248 if (stack.current->next == NULL) \
249 on_error; \
250 stack.current->next->prev = stack.current; \
251 stack.current->next->next = NULL; \
252 } \
253 stack.current = stack.current->next; \
254 stack.index = 0; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000255} \
256top = &(stack.current->items[stack.index++])
257
258/* Store a pointer to the item that is 'count' items back in the
259 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
260 * STACK_TOP(stack, top, on_error). */
261
262#define STACK_BACK(stack, top, count, on_error) \
263{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000264 int index; \
265 item_page_t *current; \
266 current = stack.current; \
267 index = stack.index - (count); \
268 while (index < 0) \
269 { \
270 if (current->prev == NULL) \
271 on_error; \
272 current = current->prev; \
273 index += STACK_PAGE_SIZE; \
274 } \
275 top = &(current->items[index]); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000276}
277
278/* Store a pointer to the top item on the stack. Execute the
279 * 'on_error' code if there are no items on the stack. */
280
281#define STACK_TOP(stack, top, on_error) \
282if (stack.index == 0) \
283{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000284 if (stack.current->prev == NULL) \
285 on_error; \
286 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000287} \
288else \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000289{ \
290 top = &(stack.current->items[stack.index - 1]); \
291}
Guido van Rossum004c1e11997-05-09 02:35:58 +0000292
293/* Test to see if the stack is empty */
294
295#define STACK_EMPTY(stack) ((stack.index == 0) && \
296 (stack.current->prev == NULL))
297
Guido van Rossum004c1e11997-05-09 02:35:58 +0000298/* Return the start of register 'reg' */
299
300#define GET_REG_START(state, reg) (state.start[reg])
301
302/* Return the end of register 'reg' */
303
304#define GET_REG_END(state, reg) (state.end[reg])
305
306/* Set the start of register 'reg'. If the state of the register needs
307 * saving, push it on the stack. */
308
309#define SET_REG_START(state, reg, text, on_error) \
310if(state.changed[reg] < state.level) \
311{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000312 item_t *item; \
313 STACK_NEXT(state.stack, item, on_error); \
314 item->reg.num = reg; \
315 item->reg.start = state.start[reg]; \
316 item->reg.end = state.end[reg]; \
317 item->reg.level = state.changed[reg]; \
318 state.changed[reg] = state.level; \
319 state.count++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000320} \
321state.start[reg] = text
322
323/* Set the end of register 'reg'. If the state of the register needs
324 * saving, push it on the stack. */
325
326#define SET_REG_END(state, reg, text, on_error) \
327if(state.changed[reg] < state.level) \
328{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000329 item_t *item; \
330 STACK_NEXT(state.stack, item, on_error); \
331 item->reg.num = reg; \
332 item->reg.start = state.start[reg]; \
333 item->reg.end = state.end[reg]; \
334 item->reg.level = state.changed[reg]; \
335 state.changed[reg] = state.level; \
336 state.count++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000337} \
338state.end[reg] = text
339
340#define PUSH_FAILURE(state, xcode, xtext, on_error) \
341{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000342 item_t *item; \
343 STACK_NEXT(state.stack, item, on_error); \
344 item->fail.code = xcode; \
345 item->fail.text = xtext; \
346 item->fail.count = state.count; \
347 item->fail.level = state.level; \
348 item->fail.phantom = 0; \
349 state.count = 0; \
350 state.level++; \
351 state.point++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000352}
353
354/* Update the last failure point with a new position in the text. */
355
Guido van Rossum004c1e11997-05-09 02:35:58 +0000356#define UPDATE_FAILURE(state, xtext, on_error) \
357{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000358 item_t *item; \
359 STACK_BACK(state.stack, item, state.count + 1, on_error); \
360 if (!item->fail.phantom) \
361 { \
362 item_t *item2; \
363 STACK_NEXT(state.stack, item2, on_error); \
364 item2->fail.code = item->fail.code; \
365 item2->fail.text = xtext; \
366 item2->fail.count = state.count; \
367 item2->fail.level = state.level; \
368 item2->fail.phantom = 1; \
369 state.count = 0; \
370 state.level++; \
371 state.point++; \
372 } \
373 else \
374 { \
375 STACK_DISCARD(state.stack, state.count, on_error); \
376 STACK_TOP(state.stack, item, on_error); \
377 item->fail.text = xtext; \
378 state.count = 0; \
379 state.level++; \
380 } \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000381}
382
383#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
384{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000385 item_t *item; \
386 do \
387 { \
388 while(state.count > 0) \
389 { \
390 STACK_PREV(state.stack, item, on_error); \
391 state.start[item->reg.num] = item->reg.start; \
392 state.end[item->reg.num] = item->reg.end; \
393 state.changed[item->reg.num] = item->reg.level; \
394 state.count--; \
395 } \
396 STACK_PREV(state.stack, item, on_empty); \
397 xcode = item->fail.code; \
398 xtext = item->fail.text; \
399 state.count = item->fail.count; \
400 state.level = item->fail.level; \
401 state.point--; \
402 } \
403 while (item->fail.text == NULL); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000404}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000405
406enum regexp_compiled_ops /* opcodes for compiled regexp */
407{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000408 Cend, /* end of pattern reached */
409 Cbol, /* beginning of line */
410 Ceol, /* end of line */
411 Cset, /* character set. Followed by 32 bytes of set. */
412 Cexact, /* followed by a byte to match */
413 Canychar, /* matches any character except newline */
414 Cstart_memory, /* set register start addr (followed by reg number) */
415 Cend_memory, /* set register end addr (followed by reg number) */
416 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
417 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
418 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
419 Cfailure_jump, /* jump to addr on failure */
420 Cupdate_failure_jump, /* update topmost failure point and jump */
421 Cdummy_failure_jump, /* push a dummy failure point and jump */
422 Cbegbuf, /* match at beginning of buffer */
423 Cendbuf, /* match at end of buffer */
424 Cwordbeg, /* match at beginning of word */
425 Cwordend, /* match at end of word */
426 Cwordbound, /* match if at word boundary */
427 Cnotwordbound, /* match if not at word boundary */
428 Csyntaxspec, /* matches syntax code (1 byte follows) */
429 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte foll)*/
430 Crepeat1
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000431};
432
433enum regexp_syntax_op /* syntax codes for plain and quoted characters */
434{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000435 Rend, /* special code for end of regexp */
436 Rnormal, /* normal character */
437 Ranychar, /* any character except newline */
438 Rquote, /* the quote character */
439 Rbol, /* match beginning of line */
440 Reol, /* match end of line */
441 Roptional, /* match preceding expression optionally */
442 Rstar, /* match preceding expr zero or more times */
443 Rplus, /* match preceding expr one or more times */
444 Ror, /* match either of alternatives */
445 Ropenpar, /* opening parenthesis */
446 Rclosepar, /* closing parenthesis */
447 Rmemory, /* match memory register */
448 Rextended_memory, /* \vnn to match registers 10-99 */
449 Ropenset, /* open set. Internal syntax hard-coded below. */
450 /* the following are gnu extensions to "normal" regexp syntax */
451 Rbegbuf, /* beginning of buffer */
452 Rendbuf, /* end of buffer */
453 Rwordchar, /* word character */
454 Rnotwordchar, /* not word character */
455 Rwordbeg, /* beginning of word */
456 Rwordend, /* end of word */
457 Rwordbound, /* word bound */
458 Rnotwordbound, /* not word bound */
459 Rnum_ops
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000460};
461
462static int re_compile_initialized = 0;
463static int regexp_syntax = 0;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000464int re_syntax = 0; /* Exported copy of regexp_syntax */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000465static unsigned char regexp_plain_ops[256];
466static unsigned char regexp_quoted_ops[256];
467static unsigned char regexp_precedences[Rnum_ops];
468static int regexp_context_indep_ops;
469static int regexp_ansi_sequences;
470
471#define NUM_LEVELS 5 /* number of precedence levels in use */
472#define MAX_NESTING 100 /* max nesting level of operators */
473
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000474#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
475#define Sword 1
476
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000477static char re_syntax_table[256];
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000478
Guido van Rossum004c1e11997-05-09 02:35:58 +0000479static void re_compile_initialize(void)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000480{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000481 int a;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000482
Guido van Rossumfaf49081997-07-15 01:47:08 +0000483 static int syntax_table_inited = 0;
484
485 if (!syntax_table_inited)
486 {
487 syntax_table_inited = 1;
488 memset(re_syntax_table, 0, 256);
489 for (a = 'a'; a <= 'z'; a++)
490 re_syntax_table[a] = Sword;
491 for (a = 'A'; a <= 'Z'; a++)
492 re_syntax_table[a] = Sword;
493 for (a = '0'; a <= '9'; a++)
494 re_syntax_table[a] = Sword;
495 }
496 re_compile_initialized = 1;
497 for (a = 0; a < 256; a++)
498 {
499 regexp_plain_ops[a] = Rnormal;
500 regexp_quoted_ops[a] = Rnormal;
501 }
502 for (a = '0'; a <= '9'; a++)
503 regexp_quoted_ops[a] = Rmemory;
504 regexp_plain_ops['\134'] = Rquote;
505 if (regexp_syntax & RE_NO_BK_PARENS)
506 {
507 regexp_plain_ops['('] = Ropenpar;
508 regexp_plain_ops[')'] = Rclosepar;
509 }
510 else
511 {
512 regexp_quoted_ops['('] = Ropenpar;
513 regexp_quoted_ops[')'] = Rclosepar;
514 }
515 if (regexp_syntax & RE_NO_BK_VBAR)
516 regexp_plain_ops['\174'] = Ror;
517 else
518 regexp_quoted_ops['\174'] = Ror;
519 regexp_plain_ops['*'] = Rstar;
520 if (regexp_syntax & RE_BK_PLUS_QM)
521 {
522 regexp_quoted_ops['+'] = Rplus;
523 regexp_quoted_ops['?'] = Roptional;
524 }
525 else
526 {
527 regexp_plain_ops['+'] = Rplus;
528 regexp_plain_ops['?'] = Roptional;
529 }
530 if (regexp_syntax & RE_NEWLINE_OR)
531 regexp_plain_ops['\n'] = Ror;
532 regexp_plain_ops['\133'] = Ropenset;
533 regexp_plain_ops['\136'] = Rbol;
534 regexp_plain_ops['$'] = Reol;
535 regexp_plain_ops['.'] = Ranychar;
536 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
537 {
538 regexp_quoted_ops['w'] = Rwordchar;
539 regexp_quoted_ops['W'] = Rnotwordchar;
540 regexp_quoted_ops['<'] = Rwordbeg;
541 regexp_quoted_ops['>'] = Rwordend;
542 regexp_quoted_ops['b'] = Rwordbound;
543 regexp_quoted_ops['B'] = Rnotwordbound;
544 regexp_quoted_ops['`'] = Rbegbuf;
545 regexp_quoted_ops['\''] = Rendbuf;
546 }
547 if (regexp_syntax & RE_ANSI_HEX)
548 regexp_quoted_ops['v'] = Rextended_memory;
549 for (a = 0; a < Rnum_ops; a++)
550 regexp_precedences[a] = 4;
551 if (regexp_syntax & RE_TIGHT_VBAR)
552 {
553 regexp_precedences[Ror] = 3;
554 regexp_precedences[Rbol] = 2;
555 regexp_precedences[Reol] = 2;
556 }
557 else
558 {
559 regexp_precedences[Ror] = 2;
560 regexp_precedences[Rbol] = 3;
561 regexp_precedences[Reol] = 3;
562 }
563 regexp_precedences[Rclosepar] = 1;
564 regexp_precedences[Rend] = 0;
565 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
566 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000567}
568
Guido van Rossum004c1e11997-05-09 02:35:58 +0000569int re_set_syntax(int syntax)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000570{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000571 int ret;
572
573 ret = regexp_syntax;
574 regexp_syntax = syntax;
575 re_syntax = syntax; /* Exported copy */
576 re_compile_initialize();
577 return ret;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000578}
579
Guido van Rossum004c1e11997-05-09 02:35:58 +0000580static int hex_char_to_decimal(int ch)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000581{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000582 if (ch >= '0' && ch <= '9')
583 return ch - '0';
584 if (ch >= 'a' && ch <= 'f')
585 return ch - 'a' + 10;
586 if (ch >= 'A' && ch <= 'F')
587 return ch - 'A' + 10;
588 return 16;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000589}
590
Guido van Rossum004c1e11997-05-09 02:35:58 +0000591static void re_compile_fastmap_aux(char *code,
592 int pos,
593 char *visited,
594 char *can_be_null,
595 char *fastmap)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000596{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000597 int a;
598 int b;
599 int syntaxcode;
600
601 if (visited[pos])
602 return; /* we have already been here */
603 visited[pos] = 1;
604 for (;;)
605 switch (code[pos++])
606 {
607 case Cend:
608 {
609 *can_be_null = 1;
610 return;
611 }
612 case Cbol:
613 case Cbegbuf:
614 case Cendbuf:
615 case Cwordbeg:
616 case Cwordend:
617 case Cwordbound:
618 case Cnotwordbound:
619 {
620 for (a = 0; a < 256; a++)
621 fastmap[a] = 1;
622 break;
623 }
624 case Csyntaxspec:
625 {
626 syntaxcode = code[pos++];
627 for (a = 0; a < 256; a++)
628 if (SYNTAX(a) == syntaxcode)
629 fastmap[a] = 1;
630 return;
631 }
632 case Cnotsyntaxspec:
633 {
634 syntaxcode = code[pos++];
635 for (a = 0; a < 256; a++)
636 if (SYNTAX(a) != syntaxcode)
637 fastmap[a] = 1;
638 return;
639 }
640 case Ceol:
641 {
642 fastmap['\n'] = 1;
643 if (*can_be_null == 0)
644 *can_be_null = 2; /* can match null, but only at end of buffer*/
645 return;
646 }
647 case Cset:
648 {
649 for (a = 0; a < 256/8; a++)
650 if (code[pos + a] != 0)
651 for (b = 0; b < 8; b++)
652 if (code[pos + a] & (1 << b))
653 fastmap[(a << 3) + b] = 1;
654 pos += 256/8;
655 return;
656 }
657 case Cexact:
658 {
659 fastmap[(unsigned char)code[pos]] = 1;
660 return;
661 }
662 case Canychar:
663 {
664 for (a = 0; a < 256; a++)
665 if (a != '\n')
666 fastmap[a] = 1;
667 return;
668 }
669 case Cstart_memory:
670 case Cend_memory:
671 {
672 pos++;
673 break;
674 }
675 case Cmatch_memory:
676 {
677 for (a = 0; a < 256; a++)
678 fastmap[a] = 1;
679 *can_be_null = 1;
680 return;
681 }
682 case Cjump:
683 case Cdummy_failure_jump:
684 case Cupdate_failure_jump:
685 case Cstar_jump:
686 {
687 a = (unsigned char)code[pos++];
688 a |= (unsigned char)code[pos++] << 8;
689 pos += (int)SHORT(a);
690 if (visited[pos])
691 {
692 /* argh... the regexp contains empty loops. This is not
693 good, as this may cause a failure stack overflow when
694 matching. Oh well. */
695 /* this path leads nowhere; pursue other paths. */
696 return;
697 }
698 visited[pos] = 1;
699 break;
700 }
701 case Cfailure_jump:
702 {
703 a = (unsigned char)code[pos++];
704 a |= (unsigned char)code[pos++] << 8;
705 a = pos + (int)SHORT(a);
706 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
707 break;
708 }
709 case Crepeat1:
710 {
711 pos += 2;
712 break;
713 }
714 default:
715 {
716 abort(); /* probably some opcode is missing from this switch */
717 /*NOTREACHED*/
718 }
719 }
Guido van Rossum004c1e11997-05-09 02:35:58 +0000720}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000721
Guido van Rossum004c1e11997-05-09 02:35:58 +0000722static int re_do_compile_fastmap(char *buffer,
723 int used,
724 int pos,
725 char *can_be_null,
726 char *fastmap)
727{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000728 char small_visited[512], *visited;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000729
Guido van Rossumfaf49081997-07-15 01:47:08 +0000730 if (used <= sizeof(small_visited))
731 visited = small_visited;
732 else
733 {
734 visited = malloc(used);
735 if (!visited)
736 return 0;
737 }
738 *can_be_null = 0;
739 memset(fastmap, 0, 256);
740 memset(visited, 0, used);
741 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
742 if (visited != small_visited)
743 free(visited);
744 return 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000745}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000746
Guido van Rossum004c1e11997-05-09 02:35:58 +0000747void re_compile_fastmap(regexp_t bufp)
748{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000749 if (!bufp->fastmap || bufp->fastmap_accurate)
750 return;
751 assert(bufp->used > 0);
752 if (!re_do_compile_fastmap(bufp->buffer,
753 bufp->used,
754 0,
755 &bufp->can_be_null,
756 bufp->fastmap))
757 return;
758 if (bufp->buffer[0] == Cbol)
759 bufp->anchor = 1; /* begline */
760 else
761 if (bufp->buffer[0] == Cbegbuf)
762 bufp->anchor = 2; /* begbuf */
763 else
764 bufp->anchor = 0; /* none */
765 bufp->fastmap_accurate = 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000766}
767
768/*
769 * star is coded as:
770 * 1: failure_jump 2
771 * ... code for operand of star
772 * star_jump 1
773 * 2: ... code after star
774 *
775 * We change the star_jump to update_failure_jump if we can determine
776 * that it is safe to do so; otherwise we change it to an ordinary
777 * jump.
778 *
779 * plus is coded as
780 *
781 * jump 2
782 * 1: failure_jump 3
783 * 2: ... code for operand of plus
784 * star_jump 1
785 * 3: ... code after plus
786 *
787 * For star_jump considerations this is processed identically to star.
788 *
789 */
790
791static int re_optimize_star_jump(regexp_t bufp, char *code)
792{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000793 char map[256];
794 char can_be_null;
795 char *p1;
796 char *p2;
797 char ch;
798 int a;
799 int b;
800 int num_instructions = 0;
801
802 a = (unsigned char)*code++;
803 a |= (unsigned char)*code++ << 8;
804 a = (int)SHORT(a);
805
806 p1 = code + a + 3; /* skip the failure_jump */
807 assert(p1[-3] == Cfailure_jump);
808 p2 = code;
809 /* p1 points inside loop, p2 points to after loop */
810 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
811 p2 - bufp->buffer, &can_be_null, map))
812 goto make_normal_jump;
813
814 /* If we might introduce a new update point inside the
815 * loop, we can't optimize because then update_jump would
816 * update a wrong failure point. Thus we have to be
817 * quite careful here.
818 */
819
820 /* loop until we find something that consumes a character */
Guido van Rossum004c1e11997-05-09 02:35:58 +0000821 loop_p1:
Guido van Rossumfaf49081997-07-15 01:47:08 +0000822 num_instructions++;
823 switch (*p1++)
824 {
825 case Cbol:
826 case Ceol:
827 case Cbegbuf:
828 case Cendbuf:
829 case Cwordbeg:
830 case Cwordend:
831 case Cwordbound:
832 case Cnotwordbound:
833 {
834 goto loop_p1;
835 }
836 case Cstart_memory:
837 case Cend_memory:
838 {
839 p1++;
840 goto loop_p1;
841 }
842 case Cexact:
843 {
844 ch = (unsigned char)*p1++;
845 if (map[(int)ch])
846 goto make_normal_jump;
847 break;
848 }
849 case Canychar:
850 {
851 for (b = 0; b < 256; b++)
852 if (b != '\n' && map[b])
853 goto make_normal_jump;
854 break;
855 }
856 case Cset:
857 {
858 for (b = 0; b < 256; b++)
859 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
860 goto make_normal_jump;
861 p1 += 256/8;
862 break;
863 }
864 default:
865 {
866 goto make_normal_jump;
867 }
868 }
869 /* now we know that we can't backtrack. */
870 while (p1 != p2 - 3)
871 {
872 num_instructions++;
873 switch (*p1++)
874 {
875 case Cend:
876 {
877 return 0;
878 }
879 case Cbol:
880 case Ceol:
881 case Canychar:
882 case Cbegbuf:
883 case Cendbuf:
884 case Cwordbeg:
885 case Cwordend:
886 case Cwordbound:
887 case Cnotwordbound:
888 {
889 break;
890 }
891 case Cset:
892 {
893 p1 += 256/8;
894 break;
895 }
896 case Cexact:
897 case Cstart_memory:
898 case Cend_memory:
899 case Cmatch_memory:
900 case Csyntaxspec:
901 case Cnotsyntaxspec:
902 {
903 p1++;
904 break;
905 }
906 case Cjump:
907 case Cstar_jump:
908 case Cfailure_jump:
909 case Cupdate_failure_jump:
910 case Cdummy_failure_jump:
911 {
912 goto make_normal_jump;
913 }
914 default:
915 {
916 return 0;
917 break;
918 }
919 }
920 }
921
Guido van Rossumdb25f321997-07-10 14:31:32 +0000922 make_update_jump:
Guido van Rossumfaf49081997-07-15 01:47:08 +0000923 code -= 3;
924 a += 3; /* jump to after the Cfailure_jump */
925 code[0] = Cupdate_failure_jump;
926 code[1] = a & 0xff;
927 code[2] = a >> 8;
928 if (num_instructions > 1)
929 return 1;
930 assert(num_instructions == 1);
931 /* if the only instruction matches a single character, we can do
932 * better */
933 p1 = code + 3 + a; /* start of sole instruction */
934 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
935 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
936 code[0] = Crepeat1;
937 return 1;
938
Guido van Rossum004c1e11997-05-09 02:35:58 +0000939 make_normal_jump:
Guido van Rossumfaf49081997-07-15 01:47:08 +0000940 code -= 3;
941 *code = Cjump;
942 return 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000943}
944
945static int re_optimize(regexp_t bufp)
946{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000947 char *code;
948
949 code = bufp->buffer;
950
951 while(1)
952 {
953 switch (*code++)
954 {
955 case Cend:
956 {
957 return 1;
958 }
959 case Canychar:
960 case Cbol:
961 case Ceol:
962 case Cbegbuf:
963 case Cendbuf:
964 case Cwordbeg:
965 case Cwordend:
966 case Cwordbound:
967 case Cnotwordbound:
968 {
969 break;
970 }
971 case Cset:
972 {
973 code += 256/8;
974 break;
975 }
976 case Cexact:
977 case Cstart_memory:
978 case Cend_memory:
979 case Cmatch_memory:
980 case Csyntaxspec:
981 case Cnotsyntaxspec:
982 {
983 code++;
984 break;
985 }
986 case Cstar_jump:
987 {
988 if (!re_optimize_star_jump(bufp, code))
989 {
990 return 0;
991 }
992 /* fall through */
993 }
994 case Cupdate_failure_jump:
995 case Cjump:
996 case Cdummy_failure_jump:
997 case Cfailure_jump:
998 case Crepeat1:
999 {
1000 code += 2;
1001 break;
1002 }
1003 default:
1004 {
1005 return 0;
1006 }
1007 }
1008 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001009}
1010
1011#define NEXTCHAR(var) \
1012{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001013 if (pos >= size) \
1014 goto ends_prematurely; \
1015 (var) = regex[pos]; \
1016 pos++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001017}
1018
1019#define ALLOC(amount) \
1020{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001021 if (pattern_offset+(amount) > alloc) \
1022 { \
1023 alloc += 256 + (amount); \
1024 pattern = realloc(pattern, alloc); \
1025 if (!pattern) \
1026 goto out_of_memory; \
1027 } \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001028}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001029
1030#define STORE(ch) pattern[pattern_offset++] = (ch)
1031
1032#define CURRENT_LEVEL_START (starts[starts_base + current_level])
1033
1034#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
1035
Guido van Rossum004c1e11997-05-09 02:35:58 +00001036#define PUSH_LEVEL_STARTS \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001037if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
1038 starts_base += NUM_LEVELS; \
1039else \
1040 goto too_complex \
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001041
1042#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
1043
Guido van Rossum004c1e11997-05-09 02:35:58 +00001044#define PUT_ADDR(offset,addr) \
1045{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001046 int disp = (addr) - (offset) - 2; \
1047 pattern[(offset)] = disp & 0xff; \
1048 pattern[(offset)+1] = (disp>>8) & 0xff; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001049}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001050
Guido van Rossum004c1e11997-05-09 02:35:58 +00001051#define INSERT_JUMP(pos,type,addr) \
1052{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001053 int a, p = (pos), t = (type), ad = (addr); \
1054 for (a = pattern_offset - 1; a >= p; a--) \
1055 pattern[a + 3] = pattern[a]; \
1056 pattern[p] = t; \
1057 PUT_ADDR(p+1,ad); \
1058 pattern_offset += 3; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001059}
Guido van Rossumfaf49081997-07-15 01:47:08 +00001060
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001061#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
1062
Guido van Rossum004c1e11997-05-09 02:35:58 +00001063#define SET_FIELDS \
1064{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001065 bufp->allocated = alloc; \
1066 bufp->buffer = pattern; \
1067 bufp->used = pattern_offset; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001068}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001069
Guido van Rossum004c1e11997-05-09 02:35:58 +00001070#define GETHEX(var) \
1071{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001072 char gethex_ch, gethex_value; \
1073 NEXTCHAR(gethex_ch); \
1074 gethex_value = hex_char_to_decimal(gethex_ch); \
1075 if (gethex_value == 16) \
1076 goto hex_error; \
1077 NEXTCHAR(gethex_ch); \
1078 gethex_ch = hex_char_to_decimal(gethex_ch); \
1079 if (gethex_ch == 16) \
1080 goto hex_error; \
1081 (var) = gethex_value * 16 + gethex_ch; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001082}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001083
Guido van Rossumfaf49081997-07-15 01:47:08 +00001084#define ANSI_TRANSLATE(ch) \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001085{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001086 switch (ch) \
1087 { \
1088 case 'a': \
1089 case 'A': \
1090 { \
1091 ch = 7; /* audible bell */ \
1092 break; \
1093 } \
1094 case 'b': \
1095 case 'B': \
1096 { \
1097 ch = 8; /* backspace */ \
1098 break; \
1099 } \
1100 case 'f': \
1101 case 'F': \
1102 { \
1103 ch = 12; /* form feed */ \
1104 break; \
1105 } \
1106 case 'n': \
1107 case 'N': \
1108 { \
1109 ch = 10; /* line feed */ \
1110 break; \
1111 } \
1112 case 'r': \
1113 case 'R': \
1114 { \
1115 ch = 13; /* carriage return */ \
1116 break; \
1117 } \
1118 case 't': \
1119 case 'T': \
1120 { \
1121 ch = 9; /* tab */ \
1122 break; \
1123 } \
1124 case 'v': \
1125 case 'V': \
1126 { \
1127 ch = 11; /* vertical tab */ \
1128 break; \
1129 } \
1130 case 'x': /* hex code */ \
1131 case 'X': \
1132 { \
1133 GETHEX(ch); \
1134 break; \
1135 } \
1136 default: \
1137 { \
1138 /* other characters passed through */ \
1139 if (translate) \
1140 ch = translate[(unsigned char)ch]; \
1141 break; \
1142 } \
1143 } \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001144}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001145
Guido van Rossum004c1e11997-05-09 02:35:58 +00001146char *re_compile_pattern(char *regex, int size, regexp_t bufp)
1147{
Guido van Rossumfaf49081997-07-15 01:47:08 +00001148 int a;
1149 int pos;
1150 int op;
1151 int current_level;
1152 int level;
1153 int opcode;
1154 int pattern_offset = 0, alloc;
1155 int starts[NUM_LEVELS * MAX_NESTING];
1156 int starts_base;
1157 int future_jumps[MAX_NESTING];
1158 int num_jumps;
1159 unsigned char ch = '\0';
1160 char *pattern;
1161 char *translate;
1162 int next_register;
1163 int paren_depth;
1164 int num_open_registers;
1165 int open_registers[RE_NREGS];
1166 int beginning_context;
1167
1168 if (!re_compile_initialized)
1169 re_compile_initialize();
1170 bufp->used = 0;
1171 bufp->fastmap_accurate = 0;
1172 bufp->uses_registers = 1;
1173 bufp->num_registers = 1;
1174 translate = bufp->translate;
1175 pattern = bufp->buffer;
1176 alloc = bufp->allocated;
1177 if (alloc == 0 || pattern == NULL)
1178 {
1179 alloc = 256;
1180 pattern = malloc(alloc);
1181 if (!pattern)
1182 goto out_of_memory;
1183 }
1184 pattern_offset = 0;
1185 starts_base = 0;
1186 num_jumps = 0;
1187 current_level = 0;
1188 SET_LEVEL_START;
1189 num_open_registers = 0;
1190 next_register = 1;
1191 paren_depth = 0;
1192 beginning_context = 1;
1193 op = -1;
1194 /* we use Rend dummy to ensure that pending jumps are updated
1195 (due to low priority of Rend) before exiting the loop. */
1196 pos = 0;
1197 while (op != Rend)
1198 {
1199 if (pos >= size)
1200 op = Rend;
1201 else
1202 {
1203 NEXTCHAR(ch);
1204 if (translate)
1205 ch = translate[(unsigned char)ch];
1206 op = regexp_plain_ops[(unsigned char)ch];
1207 if (op == Rquote)
1208 {
1209 NEXTCHAR(ch);
1210 op = regexp_quoted_ops[(unsigned char)ch];
1211 if (op == Rnormal && regexp_ansi_sequences)
1212 ANSI_TRANSLATE(ch);
1213 }
1214 }
1215 level = regexp_precedences[op];
1216 /* printf("ch='%c' op=%d level=%d current_level=%d
1217 curlevstart=%d\n", ch, op, level, current_level,
1218 CURRENT_LEVEL_START); */
1219 if (level > current_level)
1220 {
1221 for (current_level++; current_level < level; current_level++)
1222 SET_LEVEL_START;
1223 SET_LEVEL_START;
1224 }
1225 else
1226 if (level < current_level)
1227 {
1228 current_level = level;
1229 for (;num_jumps > 0 &&
1230 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
1231 num_jumps--)
1232 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
1233 }
1234 switch (op)
1235 {
1236 case Rend:
1237 {
1238 break;
1239 }
1240 case Rnormal:
1241 {
1242 normal_char:
1243 opcode = Cexact;
1244 store_opcode_and_arg: /* opcode & ch must be set */
1245 SET_LEVEL_START;
1246 ALLOC(2);
1247 STORE(opcode);
1248 STORE(ch);
1249 break;
1250 }
1251 case Ranychar:
1252 {
1253 opcode = Canychar;
1254 store_opcode:
1255 SET_LEVEL_START;
1256 ALLOC(1);
1257 STORE(opcode);
1258 break;
1259 }
1260 case Rquote:
1261 {
1262 abort();
1263 /*NOTREACHED*/
1264 }
1265 case Rbol:
1266 {
1267 if (!beginning_context)
1268 if (regexp_context_indep_ops)
1269 goto op_error;
1270 else
1271 goto normal_char;
1272 opcode = Cbol;
1273 goto store_opcode;
1274 }
1275 case Reol:
1276 {
1277 if (!((pos >= size) ||
1278 ((regexp_syntax & RE_NO_BK_VBAR) ?
1279 (regex[pos] == '\174') :
1280 (pos+1 < size && regex[pos] == '\134' &&
1281 regex[pos+1] == '\174')) ||
1282 ((regexp_syntax & RE_NO_BK_PARENS)?
1283 (regex[pos] == ')'):
1284 (pos+1 < size && regex[pos] == '\134' &&
1285 regex[pos+1] == ')'))))
1286 if (regexp_context_indep_ops)
1287 goto op_error;
1288 else
1289 goto normal_char;
1290 opcode = Ceol;
1291 goto store_opcode;
1292 /* NOTREACHED */
1293 break;
1294 }
1295 case Roptional:
1296 {
1297 if (beginning_context)
1298 if (regexp_context_indep_ops)
1299 goto op_error;
1300 else
1301 goto normal_char;
1302 if (CURRENT_LEVEL_START == pattern_offset)
1303 break; /* ignore empty patterns for ? */
1304 ALLOC(3);
1305 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1306 pattern_offset + 3);
1307 break;
1308 }
1309 case Rstar:
1310 case Rplus:
1311 {
1312 if (beginning_context)
1313 if (regexp_context_indep_ops)
1314 goto op_error;
1315 else
1316 goto normal_char;
1317 if (CURRENT_LEVEL_START == pattern_offset)
1318 break; /* ignore empty patterns for + and * */
1319 ALLOC(9);
1320 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1321 pattern_offset + 6);
1322 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1323 if (op == Rplus) /* jump over initial failure_jump */
1324 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1325 CURRENT_LEVEL_START + 6);
1326 break;
1327 }
1328 case Ror:
1329 {
1330 ALLOC(6);
1331 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1332 pattern_offset + 6);
1333 if (num_jumps >= MAX_NESTING)
1334 goto too_complex;
1335 STORE(Cjump);
1336 future_jumps[num_jumps++] = pattern_offset;
1337 STORE(0);
1338 STORE(0);
1339 SET_LEVEL_START;
1340 break;
1341 }
1342 case Ropenpar:
1343 {
1344 SET_LEVEL_START;
1345 if (next_register < RE_NREGS)
1346 {
1347 bufp->uses_registers = 1;
1348 ALLOC(2);
1349 STORE(Cstart_memory);
1350 STORE(next_register);
1351 open_registers[num_open_registers++] = next_register;
1352 bufp->num_registers++;
1353 next_register++;
1354 }
1355 paren_depth++;
1356 PUSH_LEVEL_STARTS;
1357 current_level = 0;
1358 SET_LEVEL_START;
1359 break;
1360 }
1361 case Rclosepar:
1362 {
1363 if (paren_depth <= 0)
1364 goto parenthesis_error;
1365 POP_LEVEL_STARTS;
1366 current_level = regexp_precedences[Ropenpar];
1367 paren_depth--;
1368 if (paren_depth < num_open_registers)
1369 {
1370 bufp->uses_registers = 1;
1371 ALLOC(2);
1372 STORE(Cend_memory);
1373 num_open_registers--;
1374 STORE(open_registers[num_open_registers]);
1375 }
1376 break;
1377 }
1378 case Rmemory:
1379 {
1380 if (ch == '0')
1381 goto bad_match_register;
1382 assert(ch >= '0' && ch <= '9');
1383 bufp->uses_registers = 1;
1384 opcode = Cmatch_memory;
1385 ch -= '0';
1386 goto store_opcode_and_arg;
1387 }
1388 case Rextended_memory:
1389 {
1390 NEXTCHAR(ch);
1391 if (ch < '0' || ch > '9')
1392 goto bad_match_register;
1393 NEXTCHAR(a);
1394 if (a < '0' || a > '9')
1395 goto bad_match_register;
1396 ch = 10 * (a - '0') + ch - '0';
1397 if (ch <= 0 || ch >= RE_NREGS)
1398 goto bad_match_register;
1399 bufp->uses_registers = 1;
1400 opcode = Cmatch_memory;
1401 goto store_opcode_and_arg;
1402 }
1403 case Ropenset:
1404 {
1405 int complement;
1406 int prev;
1407 int offset;
1408 int range;
1409 int firstchar;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001410
Guido van Rossumfaf49081997-07-15 01:47:08 +00001411 SET_LEVEL_START;
1412 ALLOC(1+256/8);
1413 STORE(Cset);
1414 offset = pattern_offset;
1415 for (a = 0; a < 256/8; a++)
1416 STORE(0);
1417 NEXTCHAR(ch);
1418 if (translate)
1419 ch = translate[(unsigned char)ch];
1420 if (ch == '\136')
1421 {
1422 complement = 1;
1423 NEXTCHAR(ch);
1424 if (translate)
1425 ch = translate[(unsigned char)ch];
1426 }
1427 else
1428 complement = 0;
1429 prev = -1;
1430 range = 0;
1431 firstchar = 1;
1432 while (ch != '\135' || firstchar)
1433 {
1434 firstchar = 0;
1435 if (regexp_ansi_sequences && ch == '\134')
1436 {
1437 NEXTCHAR(ch);
1438 ANSI_TRANSLATE(ch);
1439 }
1440 if (range)
1441 {
1442 for (a = prev; a <= (int)ch; a++)
1443 SETBIT(pattern, offset, a);
1444 prev = -1;
1445 range = 0;
1446 }
1447 else
1448 if (prev != -1 && ch == '-')
1449 range = 1;
1450 else
1451 {
1452 SETBIT(pattern, offset, ch);
1453 prev = ch;
1454 }
1455 NEXTCHAR(ch);
1456 if (translate)
1457 ch = translate[(unsigned char)ch];
1458 }
1459 if (range)
1460 SETBIT(pattern, offset, '-');
1461 if (complement)
1462 {
1463 for (a = 0; a < 256/8; a++)
1464 pattern[offset+a] ^= 0xff;
1465 }
1466 break;
1467 }
1468 case Rbegbuf:
1469 {
1470 opcode = Cbegbuf;
1471 goto store_opcode;
1472 }
1473 case Rendbuf:
1474 {
1475 opcode = Cendbuf;
1476 goto store_opcode;
1477 }
1478 case Rwordchar:
1479 {
1480 opcode = Csyntaxspec;
1481 ch = Sword;
1482 goto store_opcode_and_arg;
1483 }
1484 case Rnotwordchar:
1485 {
1486 opcode = Cnotsyntaxspec;
1487 ch = Sword;
1488 goto store_opcode_and_arg;
1489 }
1490 case Rwordbeg:
1491 {
1492 opcode = Cwordbeg;
1493 goto store_opcode;
1494 }
1495 case Rwordend:
1496 {
1497 opcode = Cwordend;
1498 goto store_opcode;
1499 }
1500 case Rwordbound:
1501 {
1502 opcode = Cwordbound;
1503 goto store_opcode;
1504 }
1505 case Rnotwordbound:
1506 {
1507 opcode = Cnotwordbound;
1508 goto store_opcode;
1509 }
1510 default:
1511 {
1512 abort();
1513 }
1514 }
1515 beginning_context = (op == Ropenpar || op == Ror);
1516 }
1517 if (starts_base != 0)
1518 goto parenthesis_error;
1519 assert(num_jumps == 0);
1520 ALLOC(1);
1521 STORE(Cend);
1522 SET_FIELDS;
1523 if(!re_optimize(bufp))
1524 return "Optimization error";
1525 return NULL;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001526
Guido van Rossum004c1e11997-05-09 02:35:58 +00001527 op_error:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001528 SET_FIELDS;
1529 return "Badly placed special character";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001530
Guido van Rossum004c1e11997-05-09 02:35:58 +00001531 bad_match_register:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001532 SET_FIELDS;
1533 return "Bad match register number";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001534
1535 hex_error:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001536 SET_FIELDS;
1537 return "Bad hexadecimal number";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001538
1539 parenthesis_error:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001540 SET_FIELDS;
1541 return "Badly placed parenthesis";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001542
1543 out_of_memory:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001544 SET_FIELDS;
1545 return "Out of memory";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001546
1547 ends_prematurely:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001548 SET_FIELDS;
1549 return "Regular expression ends prematurely";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001550
Guido van Rossum004c1e11997-05-09 02:35:58 +00001551 too_complex:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001552 SET_FIELDS;
1553 return "Regular expression too complex";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001554}
Guido van Rossum004c1e11997-05-09 02:35:58 +00001555
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001556#undef CHARAT
1557#undef NEXTCHAR
1558#undef GETHEX
1559#undef ALLOC
1560#undef STORE
1561#undef CURRENT_LEVEL_START
1562#undef SET_LEVEL_START
1563#undef PUSH_LEVEL_STARTS
1564#undef POP_LEVEL_STARTS
1565#undef PUT_ADDR
1566#undef INSERT_JUMP
1567#undef SETBIT
1568#undef SET_FIELDS
1569
Guido van Rossum004c1e11997-05-09 02:35:58 +00001570#define PREFETCH if (text == textend) goto fail
1571
1572#define NEXTCHAR(var) \
1573PREFETCH; \
1574var = (unsigned char)*text++; \
1575if (translate) \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001576 var = translate[var]
Guido van Rossum004c1e11997-05-09 02:35:58 +00001577
1578int re_match(regexp_t bufp,
1579 char *string,
1580 int size,
1581 int pos,
1582 regexp_registers_t old_regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001583{
Guido van Rossumfaf49081997-07-15 01:47:08 +00001584 char *code;
1585 char *translate;
1586 char *text;
1587 char *textstart;
1588 char *textend;
1589 int a;
1590 int b;
1591 int ch;
1592 int reg;
1593 int match_end;
1594 char *regstart;
1595 char *regend;
1596 int regsize;
1597 match_state state;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001598
Guido van Rossumfaf49081997-07-15 01:47:08 +00001599 assert(pos >= 0 && size >= 0);
1600 assert(pos <= size);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001601
Guido van Rossumfaf49081997-07-15 01:47:08 +00001602 text = string + pos;
1603 textstart = string;
1604 textend = string + size;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001605
Guido van Rossumfaf49081997-07-15 01:47:08 +00001606 code = bufp->buffer;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001607
Guido van Rossumfaf49081997-07-15 01:47:08 +00001608 translate = bufp->translate;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001609
Guido van Rossumfaf49081997-07-15 01:47:08 +00001610 NEW_STATE(state, bufp->num_registers);
1611
1612 if (!re_compile_initialized)
1613 re_compile_initialize();
Guido van Rossum004c1e11997-05-09 02:35:58 +00001614
1615 continue_matching:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001616 switch (*code++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001617 {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001618 case Cend:
Guido van Rossum004c1e11997-05-09 02:35:58 +00001619 {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001620 match_end = text - textstart;
1621 if (old_regs)
1622 {
1623 old_regs->start[0] = pos;
1624 old_regs->end[0] = match_end;
1625 if (!bufp->uses_registers)
1626 {
1627 for (a = 1; a < RE_NREGS; a++)
1628 {
1629 old_regs->start[a] = -1;
1630 old_regs->end[a] = -1;
1631 }
1632 }
1633 else
1634 {
1635 for (a = 1; a < bufp->num_registers; a++)
1636 {
1637 if ((GET_REG_START(state, a) == NULL) ||
1638 (GET_REG_END(state, a) == NULL))
1639 {
1640 old_regs->start[a] = -1;
1641 old_regs->end[a] = -1;
1642 continue;
1643 }
1644 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1645 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1646 }
1647 for (; a < RE_NREGS; a++)
1648 {
1649 old_regs->start[a] = -1;
1650 old_regs->end[a] = -1;
1651 }
1652 }
1653 }
1654 FREE_STATE(state);
1655 return match_end - pos;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001656 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001657 case Cbol:
1658 {
1659 if (text == textstart || text[-1] == '\n')
1660 goto continue_matching;
1661 goto fail;
1662 }
1663 case Ceol:
1664 {
1665 if (text == textend || *text == '\n')
1666 goto continue_matching;
1667 goto fail;
1668 }
1669 case Cset:
1670 {
1671 NEXTCHAR(ch);
1672 if (code[ch/8] & (1<<(ch & 7)))
1673 {
1674 code += 256/8;
1675 goto continue_matching;
1676 }
1677 goto fail;
1678 }
1679 case Cexact:
1680 {
1681 NEXTCHAR(ch);
1682 if (ch != (unsigned char)*code++)
1683 goto fail;
1684 goto continue_matching;
1685 }
1686 case Canychar:
1687 {
1688 NEXTCHAR(ch);
1689 if (ch == '\n')
1690 goto fail;
1691 goto continue_matching;
1692 }
1693 case Cstart_memory:
1694 {
1695 reg = *code++;
1696 SET_REG_START(state, reg, text, goto error);
1697 goto continue_matching;
1698 }
1699 case Cend_memory:
1700 {
1701 reg = *code++;
1702 SET_REG_END(state, reg, text, goto error);
1703 goto continue_matching;
1704 }
1705 case Cmatch_memory:
1706 {
1707 reg = *code++;
1708 regstart = GET_REG_START(state, reg);
1709 regend = GET_REG_END(state, reg);
1710 if ((regstart == NULL) || (regend == NULL))
1711 goto fail; /* or should we just match nothing? */
1712 regsize = regend - regstart;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001713
Guido van Rossumfaf49081997-07-15 01:47:08 +00001714 if (regsize > (textend - text))
1715 goto fail;
1716 if(translate)
1717 {
1718 for (; regstart < regend; regstart++, text++)
1719 if (translate[*regstart] != translate[*text])
1720 goto fail;
1721 }
1722 else
1723 for (; regstart < regend; regstart++, text++)
1724 if (*regstart != *text)
1725 goto fail;
1726 goto continue_matching;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001727 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001728 case Cupdate_failure_jump:
Guido van Rossumdb25f321997-07-10 14:31:32 +00001729 {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001730 UPDATE_FAILURE(state, text, goto error);
1731 /* fall to next case */
Guido van Rossumdb25f321997-07-10 14:31:32 +00001732 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001733 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1734 case Cstar_jump:
1735 case Cjump:
1736 {
1737 a = (unsigned char)*code++;
1738 a |= (unsigned char)*code++ << 8;
1739 code += (int)SHORT(a);
1740 goto continue_matching;
1741 }
1742 case Cdummy_failure_jump:
1743 {
1744 a = (unsigned char)*code++;
1745 a |= (unsigned char)*code++ << 8;
1746 a = (int)SHORT(a);
1747 assert(*code == Cfailure_jump);
1748 b = (unsigned char)code[1];
1749 b |= (unsigned char)code[2] << 8;
1750 PUSH_FAILURE(state, code + (int)SHORT(b) + 3, NULL, goto error);
1751 code += a;
1752 goto continue_matching;
1753 }
1754 case Cfailure_jump:
1755 {
1756 a = (unsigned char)*code++;
1757 a |= (unsigned char)*code++ << 8;
1758 a = (int)SHORT(a);
1759 PUSH_FAILURE(state, code + a, text, goto error);
1760 goto continue_matching;
1761 }
1762 case Crepeat1:
1763 {
1764 char *pinst;
1765 a = (unsigned char)*code++;
1766 a |= (unsigned char)*code++ << 8;
1767 a = (int)SHORT(a);
1768 pinst = code + a;
1769 /* pinst is sole instruction in loop, and it matches a
1770 * single character. Since Crepeat1 was originally a
1771 * Cupdate_failure_jump, we also know that backtracking
1772 * is useless: so long as the single-character
1773 * expression matches, it must be used. Also, in the
1774 * case of +, we've already matched one character, so +
1775 * can't fail: nothing here can cause a failure. */
1776 switch (*pinst++)
1777 {
1778 case Cset:
1779 {
1780 if (translate)
1781 {
1782 while (text < textend)
1783 {
1784 ch = translate[(unsigned char)*text];
1785 if (pinst[ch/8] & (1<<(ch & 7)))
1786 text++;
1787 else
1788 break;
1789 }
1790 }
1791 else
1792 {
1793 while (text < textend)
1794 {
1795 ch = (unsigned char)*text;
1796 if (pinst[ch/8] & (1<<(ch & 7)))
1797 text++;
1798 else
1799 break;
1800 }
1801 }
1802 break;
1803 }
1804 case Cexact:
1805 {
1806 ch = (unsigned char)*pinst;
1807 if (translate)
1808 {
1809 while (text < textend &&
1810 translate[(unsigned char)*text] == ch)
1811 text++;
1812 }
1813 else
1814 {
1815 while (text < textend && (unsigned char)*text == ch)
1816 text++;
1817 }
1818 break;
1819 }
1820 case Canychar:
1821 {
1822 while (text < textend && (unsigned char)*text != '\n')
1823 text++;
1824 break;
1825 }
1826 case Csyntaxspec:
1827 {
1828 a = (unsigned char)*pinst;
1829 if (translate)
1830 {
1831 while (text < textend &&
1832 translate[SYNTAX(*text)] == a)
1833 text++;
1834 }
1835 else
1836 {
1837 while (text < textend && SYNTAX(*text) == a)
1838 text++;
1839 }
1840 break;
1841 }
1842 case Cnotsyntaxspec:
1843 {
1844 a = (unsigned char)*pinst;
1845 if (translate)
1846 {
1847 while (text < textend &&
1848 translate[SYNTAX(*text)] != a)
1849 text++;
1850 }
1851 else
1852 {
1853 while (text < textend && SYNTAX(*text) != a)
1854 text++;
1855 }
1856 break;
1857 }
1858 default:
1859 {
1860 abort();
1861 /*NOTREACHED*/
1862 }
1863 }
1864 /* due to the funky way + and * are compiled, the top
1865 * failure- stack entry at this point is actually a
1866 * success entry -- update it & pop it */
1867 UPDATE_FAILURE(state, text, goto error);
1868 goto fail; /* i.e., succeed <wink/sigh> */
1869 }
1870 case Cbegbuf:
1871 {
1872 if (text == textstart)
1873 goto continue_matching;
1874 goto fail;
1875 }
1876 case Cendbuf:
1877 {
1878 if (text == textend)
1879 goto continue_matching;
1880 goto fail;
1881 }
1882 case Cwordbeg:
1883 {
1884 if (text == textend)
1885 goto fail;
1886 if (SYNTAX(*text) != Sword)
1887 goto fail;
1888 if (text == textstart)
1889 goto continue_matching;
1890 if (SYNTAX(text[-1]) != Sword)
1891 goto continue_matching;
1892 goto fail;
1893 }
1894 case Cwordend:
1895 {
1896 if (text == textstart)
1897 goto fail;
1898 if (SYNTAX(text[-1]) != Sword)
1899 goto fail;
1900 if (text == textend)
1901 goto continue_matching;
1902 if (SYNTAX(*text) == Sword)
1903 goto fail;
1904 goto continue_matching;
1905 }
1906 case Cwordbound:
1907 {
1908 /* Note: as in gnu regexp, this also matches at the
1909 * beginning and end of buffer. */
Guido van Rossum004c1e11997-05-09 02:35:58 +00001910
Guido van Rossumfaf49081997-07-15 01:47:08 +00001911 if (text == textstart || text == textend)
1912 goto continue_matching;
1913 if ((SYNTAX(text[-1]) == Sword) ^ (SYNTAX(*text) == Sword))
1914 goto continue_matching;
1915 goto fail;
1916 }
1917 case Cnotwordbound:
1918 {
1919 /* Note: as in gnu regexp, this never matches at the
1920 * beginning and end of buffer. */
1921 if (text == textstart || text == textend)
1922 goto fail;
1923 if (!((SYNTAX(text[-1]) == Sword) ^ (SYNTAX(*text) == Sword)))
1924 goto fail;
1925 goto continue_matching;
1926 }
1927 case Csyntaxspec:
1928 {
1929 NEXTCHAR(ch);
1930 if (SYNTAX(ch) != (unsigned char)*code++)
1931 goto fail;
1932 goto continue_matching;
1933 }
1934 case Cnotsyntaxspec:
1935 {
1936 NEXTCHAR(ch);
1937 if (SYNTAX(ch) != (unsigned char)*code++)
1938 break;
1939 goto continue_matching;
1940 }
1941 default:
1942 {
1943 abort();
1944 /*NOTREACHED*/
1945 }
1946 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001947
Guido van Rossum3b1a57a1992-01-27 16:47:46 +00001948#if 0 /* This line is never reached --Guido */
Guido van Rossumfaf49081997-07-15 01:47:08 +00001949 abort();
Guido van Rossum5f21dd11992-01-19 16:49:14 +00001950#endif
Guido van Rossumfaf49081997-07-15 01:47:08 +00001951 /*
1952 *NOTREACHED
1953 */
Guido van Rossum004c1e11997-05-09 02:35:58 +00001954
1955 fail:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001956 POP_FAILURE(state, code, text, goto done_matching, goto error);
1957 goto continue_matching;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001958
1959 done_matching:
1960/* if(translated != NULL) */
1961/* free(translated); */
Guido van Rossumfaf49081997-07-15 01:47:08 +00001962 FREE_STATE(state);
1963 return -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001964
Guido van Rossum004c1e11997-05-09 02:35:58 +00001965 error:
1966/* if (translated != NULL) */
1967/* free(translated); */
Guido van Rossumfaf49081997-07-15 01:47:08 +00001968 FREE_STATE(state);
1969 return -2;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001970}
1971
1972#undef PREFETCH
1973#undef NEXTCHAR
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001974
Guido van Rossum004c1e11997-05-09 02:35:58 +00001975int re_search(regexp_t bufp,
1976 char *string,
1977 int size,
1978 int pos,
1979 int range,
1980 regexp_registers_t regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001981{
Guido van Rossumfaf49081997-07-15 01:47:08 +00001982 char *fastmap;
1983 char *translate;
1984 char *text;
1985 char *partstart;
1986 char *partend;
1987 int dir;
1988 int ret;
1989 char anchor;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001990
Guido van Rossumfaf49081997-07-15 01:47:08 +00001991 assert(size >= 0 && pos >= 0);
1992 assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001993
Guido van Rossumfaf49081997-07-15 01:47:08 +00001994 fastmap = bufp->fastmap;
1995 translate = bufp->translate;
1996 if (fastmap && !bufp->fastmap_accurate)
1997 re_compile_fastmap(bufp);
1998 anchor = bufp->anchor;
1999 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
2000 fastmap = NULL;
Guido van Rossum004c1e11997-05-09 02:35:58 +00002001
Guido van Rossumfaf49081997-07-15 01:47:08 +00002002 if (range < 0)
2003 {
2004 dir = -1;
2005 range = -range;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002006 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00002007 else
Guido van Rossumfaf49081997-07-15 01:47:08 +00002008 dir = 1;
2009
2010 if (anchor == 2)
2011 if (pos != 0)
2012 return -1;
2013 else
2014 range = 0;
2015
2016 for (; range >= 0; range--, pos += dir)
2017 {
2018 if (fastmap)
2019 {
2020 if (dir == 1)
2021 { /* searching forwards */
2022
2023 text = string + pos;
2024 partend = string + size;
2025 partstart = text;
2026 if (translate)
2027 while (text != partend &&
2028 !fastmap[(unsigned char) translate[(unsigned char)*text]])
2029 text++;
2030 else
2031 while (text != partend && !fastmap[(unsigned char)*text])
2032 text++;
2033 pos += text - partstart;
2034 range -= text - partstart;
2035 if (pos == size && bufp->can_be_null == 0)
2036 return -1;
2037 }
2038 else
2039 { /* searching backwards */
2040 text = string + pos;
2041 partstart = string + pos - range;
2042 partend = text;
2043 if (translate)
2044 while (text != partstart &&
2045 !fastmap[(unsigned char)
2046 translate[(unsigned char)*text]])
2047 text--;
2048 else
2049 while (text != partstart &&
2050 !fastmap[(unsigned char)*text])
2051 text--;
2052 pos -= partend - text;
2053 range -= partend - text;
2054 }
2055 }
2056 if (anchor == 1)
2057 { /* anchored to begline */
2058 if (pos > 0 && (string[pos - 1] != '\n'))
2059 continue;
2060 }
2061 assert(pos >= 0 && pos <= size);
2062 ret = re_match(bufp, string, size, pos, regs);
2063 if (ret >= 0)
2064 return pos;
2065 if (ret == -2)
2066 return -2;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002067 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00002068 return -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002069}