blob: 87f747fc3ebc2fdad793decee13afa421a81ccc4 [file] [log] [blame]
Guido van Rossum004c1e11997-05-09 02:35:58 +00001/* regexpr.c
2 *
3 * Author: Tatu Ylonen <ylo@ngs.fi>
4 *
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
6 *
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
11 * warranty.
12 *
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
16 *
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
20 *
21 * Emacs-specific code and syntax table code is almost directly borrowed
22 * from GNU regexp.
23 *
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
28 *
29 * $Id$ */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000030
Guido van Rossum339cfa31996-08-08 19:12:37 +000031#include "config.h" /* For Win* specific redefinition of printf c.s. */
Guido van Rossum339cfa31996-08-08 19:12:37 +000032
Guido van Rossum004c1e11997-05-09 02:35:58 +000033#include "myproto.h" /* For PROTO macro --Guido */
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000034
Guido van Rossumb674c3b1992-01-19 16:32:47 +000035#include <stdio.h>
Guido van Rossum004c1e11997-05-09 02:35:58 +000036
37#ifndef NDEBUG
38#define NDEBUG 1
39#endif
40
Guido van Rossumb674c3b1992-01-19 16:32:47 +000041#include <assert.h>
42#include "regexpr.h"
43
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000044#ifdef THINK_C
45/* Think C on the Mac really needs these headers... --Guido */
46#include <stdlib.h>
47#include <string.h>
48#else
Guido van Rossum88661e81996-05-23 22:55:58 +000049#if defined(__STDC__) || defined(_MSC_VER)
Guido van Rossum9abc5391992-03-27 17:24:37 +000050/* Don't mess around, use the standard headers */
51#include <stdlib.h>
52#include <string.h>
53#else
Guido van Rossumb674c3b1992-01-19 16:32:47 +000054char *malloc();
55void free();
56char *realloc();
Guido van Rossum9abc5391992-03-27 17:24:37 +000057#endif /* __STDC__ */
58#endif /* THINK_C */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000059
Guido van Rossum004c1e11997-05-09 02:35:58 +000060/* The stack implementation is taken from an idea by Andrew Kuchling.
61 * It's a doubly linked list of arrays. The advantages of this over a
62 * simple linked list are that the number of mallocs required are
63 * reduced. It also makes it possible to statically allocate enough
64 * space so that small patterns don't ever need to call malloc.
65 *
66 * The advantages over a single array is that is periodically
67 * realloced when more space is needed is that we avoid ever copying
68 * the stack. */
69
70/* item_t is the basic stack element. Defined as a union of
71 * structures so that both registers, failure points, and counters can
72 * be pushed/popped from the stack. There's nothing built into the
73 * item to keep track of whether a certain stack item is a register, a
74 * failure point, or a counter. */
75
76typedef union item_t
77{
78 struct
79 {
80 int num;
81 int level;
82 char *start;
83 char *end;
84 } reg;
85 struct
86 {
87 int count;
88 int level;
89 int phantom;
90 char *code;
91 char *text;
92 } fail;
93 struct
94 {
95 int num;
96 int level;
97 int count;
98 } cntr;
99} item_t;
100
101#define STACK_PAGE_SIZE 256
102#define NUM_REGISTERS 256
103
104/* A 'page' of stack items. */
105
106typedef struct item_page_t
107{
108 item_t items[STACK_PAGE_SIZE];
109 struct item_page_t *prev;
110 struct item_page_t *next;
111} item_page_t;
112
113
114typedef struct match_state
115{
116 /* Structure to encapsulate the stack. */
117 struct
118 {
119 /* index into the curent page. If index == 0 and you need
120 * to pop and item, move to the previous page and set
121 * index = STACK_PAGE_SIZE - 1. Otherwise decrement index
122 * to push a page. If index == STACK_PAGE_SIZE and you
123 * need to push a page move to the next page and set index
124 * = 0. If there is no new next page, allocate a new page
125 * and link it in. Otherwise, increment index to push a
126 * page. */
127 int index;
128 item_page_t *current; /* Pointer to the current page. */
129 item_page_t first; /* First page is statically allocated. */
130 } stack;
131 char *start[NUM_REGISTERS];
132 char *end[NUM_REGISTERS];
133
134 int changed[NUM_REGISTERS];
135 /* The number of registers that have been pushed onto the stack
136 * since the last failure point. */
137 int count;
138 /* Used to control when registers need to be pushed onto the
139 * stack. */
140 int level;
141 /* The number of failure points on the stack. */
142 int point;
143} match_state;
144
145/* Discard the top 'count' stack items. */
146
147#define STACK_DISCARD(stack, count, on_error) \
148stack.index -= count; \
149while (stack.index < 0) \
150{ \
151 if (stack.current->prev == NULL) \
152 on_error; \
153 stack.current = stack.current->prev; \
154 stack.index += STACK_PAGE_SIZE; \
155}
156
157/* Store a pointer to the previous item on the stack. Used to pop an
158 * item off of the stack. */
159
160#define STACK_PREV(stack, top, on_error) \
161if (stack.index == 0) \
162{ \
163 if (stack.current->prev == NULL) \
164 on_error; \
165 stack.current = stack.current->prev; \
166 stack.index = STACK_PAGE_SIZE - 1; \
167} \
168else \
169 stack.index--; \
170top = &(stack.current->items[stack.index])
171
172/* Store a pointer to the next item on the stack. Used to push an item
173 * on to the stack. */
174
175#define STACK_NEXT(stack, top, on_error) \
176if (stack.index == STACK_PAGE_SIZE) \
177{ \
178 if (stack.current->next == NULL) \
179 { \
Guido van Rossum445efa91997-05-14 15:30:32 +0000180 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000181 if (stack.current->next == NULL) \
182 on_error; \
183 stack.current->next->prev = stack.current; \
184 stack.current->next->next = NULL; \
185 } \
186 stack.current = stack.current->next; \
187 stack.index = 0; \
188} \
189top = &(stack.current->items[stack.index++])
190
191/* Store a pointer to the item that is 'count' items back in the
192 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
193 * STACK_TOP(stack, top, on_error). */
194
195#define STACK_BACK(stack, top, count, on_error) \
196{ \
197 int index; \
198 item_page_t *current; \
199 current = stack.current; \
200 index = stack.index - (count); \
201 while (index < 0) \
202 { \
203 if (current->prev == NULL) \
204 on_error; \
205 current = current->prev; \
206 index += STACK_PAGE_SIZE; \
207 } \
208 top = &(current->items[index]); \
209}
210
211/* Store a pointer to the top item on the stack. Execute the
212 * 'on_error' code if there are no items on the stack. */
213
214#define STACK_TOP(stack, top, on_error) \
215if (stack.index == 0) \
216{ \
217 if (stack.current->prev == NULL) \
218 on_error; \
219 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
220} \
221else \
222 top = &(stack.current->items[stack.index - 1])
223
224/* Test to see if the stack is empty */
225
226#define STACK_EMPTY(stack) ((stack.index == 0) && \
227 (stack.current->prev == NULL))
228
229
230/* Initialize a state object */
231
232#define NEW_STATE(state) \
233memset(&state, 0, sizeof(match_state)); \
234state.stack.current = &state.stack.first; \
235state.level = 1
236
237/* Free any memory that might have been malloc'd */
238
239#define FREE_STATE(state) \
240while(state.stack.first.next != NULL) \
241{ \
242 state.stack.current = state.stack.first.next; \
243 state.stack.first.next = state.stack.current->next; \
244 free(state.stack.current); \
245}
246
247/* Return the start of register 'reg' */
248
249#define GET_REG_START(state, reg) (state.start[reg])
250
251/* Return the end of register 'reg' */
252
253#define GET_REG_END(state, reg) (state.end[reg])
254
255/* Set the start of register 'reg'. If the state of the register needs
256 * saving, push it on the stack. */
257
258#define SET_REG_START(state, reg, text, on_error) \
259if(state.changed[reg] < state.level) \
260{ \
261 item_t *item; \
262 STACK_NEXT(state.stack, item, on_error); \
263 item->reg.num = reg; \
264 item->reg.start = state.start[reg]; \
265 item->reg.end = state.end[reg]; \
266 item->reg.level = state.changed[reg]; \
267 state.changed[reg] = state.level; \
268 state.count++; \
269} \
270state.start[reg] = text
271
272/* Set the end of register 'reg'. If the state of the register needs
273 * saving, push it on the stack. */
274
275#define SET_REG_END(state, reg, text, on_error) \
276if(state.changed[reg] < state.level) \
277{ \
278 item_t *item; \
279 STACK_NEXT(state.stack, item, on_error); \
280 item->reg.num = reg; \
281 item->reg.start = state.start[reg]; \
282 item->reg.end = state.end[reg]; \
283 item->reg.level = state.changed[reg]; \
284 state.changed[reg] = state.level; \
285 state.count++; \
286} \
287state.end[reg] = text
288
289#define PUSH_FAILURE(state, xcode, xtext, on_error) \
290{ \
291 item_t *item; \
292 STACK_NEXT(state.stack, item, on_error); \
293 item->fail.code = xcode; \
294 item->fail.text = xtext; \
295 item->fail.count = state.count; \
296 item->fail.level = state.level; \
297 item->fail.phantom = 0; \
298 state.count = 0; \
299 state.level++; \
300 state.point++; \
301}
302
303/* Update the last failure point with a new position in the text. */
304
305/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
306/* { \ */
307/* item_t *item; \ */
308/* STACK_DISCARD(state.stack, state.count, on_error); \ */
309/* STACK_TOP(state.stack, item, on_error); \ */
310/* item->fail.text = xtext; \ */
311/* state.count = 0; \ */
312/* } */
313
314/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
315/* { \ */
316/* item_t *item; \ */
317/* STACK_BACK(state.stack, item, state.count + 1, on_error); \ */
318/* item->fail.text = xtext; \ */
319/* } */
320
321#define UPDATE_FAILURE(state, xtext, on_error) \
322{ \
323 item_t *item; \
324 STACK_BACK(state.stack, item, state.count + 1, on_error); \
325 if (!item->fail.phantom) \
326 { \
327 item_t *item2; \
328 STACK_NEXT(state.stack, item2, on_error); \
329 item2->fail.code = item->fail.code; \
330 item2->fail.text = xtext; \
331 item2->fail.count = state.count; \
332 item2->fail.level = state.level; \
333 item2->fail.phantom = 1; \
334 state.count = 0; \
335 state.level++; \
336 state.point++; \
337 } \
338 else \
339 { \
340 STACK_DISCARD(state.stack, state.count, on_error); \
341 STACK_TOP(state.stack, item, on_error); \
342 item->fail.text = xtext; \
343 state.count = 0; \
344 state.level++; \
345 } \
346}
347
348#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
349{ \
350 item_t *item; \
351 do \
352 { \
353 while(state.count > 0) \
354 { \
355 STACK_PREV(state.stack, item, on_error); \
356 state.start[item->reg.num] = item->reg.start; \
357 state.end[item->reg.num] = item->reg.end; \
358 state.changed[item->reg.num] = item->reg.level; \
359 state.count--; \
360 } \
361 STACK_PREV(state.stack, item, on_empty); \
362 xcode = item->fail.code; \
363 xtext = item->fail.text; \
364 state.count = item->fail.count; \
365 state.level = item->fail.level; \
366 state.point--; \
367 } \
368 while (item->fail.text == NULL); \
369}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000370
371enum regexp_compiled_ops /* opcodes for compiled regexp */
372{
373 Cend, /* end of pattern reached */
374 Cbol, /* beginning of line */
375 Ceol, /* end of line */
376 Cset, /* character set. Followed by 32 bytes of set. */
377 Cexact, /* followed by a byte to match */
378 Canychar, /* matches any character except newline */
379 Cstart_memory, /* set register start addr (followed by reg number) */
380 Cend_memory, /* set register end addr (followed by reg number) */
381 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
382 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
383 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
384 Cfailure_jump, /* jump to addr on failure */
385 Cupdate_failure_jump, /* update topmost failure point and jump */
386 Cdummy_failure_jump, /* push a dummy failure point and jump */
387 Cbegbuf, /* match at beginning of buffer */
388 Cendbuf, /* match at end of buffer */
389 Cwordbeg, /* match at beginning of word */
390 Cwordend, /* match at end of word */
391 Cwordbound, /* match if at word boundary */
392 Cnotwordbound, /* match if not at word boundary */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000393 Csyntaxspec, /* matches syntax code (1 byte follows) */
394 Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/
395};
396
397enum regexp_syntax_op /* syntax codes for plain and quoted characters */
398{
399 Rend, /* special code for end of regexp */
400 Rnormal, /* normal character */
401 Ranychar, /* any character except newline */
402 Rquote, /* the quote character */
403 Rbol, /* match beginning of line */
404 Reol, /* match end of line */
405 Roptional, /* match preceding expression optionally */
406 Rstar, /* match preceding expr zero or more times */
407 Rplus, /* match preceding expr one or more times */
408 Ror, /* match either of alternatives */
409 Ropenpar, /* opening parenthesis */
410 Rclosepar, /* closing parenthesis */
411 Rmemory, /* match memory register */
412 Rextended_memory, /* \vnn to match registers 10-99 */
413 Ropenset, /* open set. Internal syntax hard-coded below. */
414 /* the following are gnu extensions to "normal" regexp syntax */
415 Rbegbuf, /* beginning of buffer */
416 Rendbuf, /* end of buffer */
417 Rwordchar, /* word character */
418 Rnotwordchar, /* not word character */
419 Rwordbeg, /* beginning of word */
420 Rwordend, /* end of word */
421 Rwordbound, /* word bound */
422 Rnotwordbound, /* not word bound */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000423 Rnum_ops
424};
425
426static int re_compile_initialized = 0;
427static int regexp_syntax = 0;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000428int re_syntax = 0; /* Exported copy of regexp_syntax */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000429static unsigned char regexp_plain_ops[256];
430static unsigned char regexp_quoted_ops[256];
431static unsigned char regexp_precedences[Rnum_ops];
432static int regexp_context_indep_ops;
433static int regexp_ansi_sequences;
434
435#define NUM_LEVELS 5 /* number of precedence levels in use */
436#define MAX_NESTING 100 /* max nesting level of operators */
437
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000438#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
439#define Sword 1
440
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000441static char re_syntax_table[256];
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000442
Guido van Rossum004c1e11997-05-09 02:35:58 +0000443static void re_compile_initialize(void)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000444{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000445 int a;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000446
Guido van Rossum004c1e11997-05-09 02:35:58 +0000447 static int syntax_table_inited = 0;
448
449 if (!syntax_table_inited)
450 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000451 syntax_table_inited = 1;
452 memset(re_syntax_table, 0, 256);
453 for (a = 'a'; a <= 'z'; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000454 re_syntax_table[a] = Sword;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000455 for (a = 'A'; a <= 'Z'; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000456 re_syntax_table[a] = Sword;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000457 for (a = '0'; a <= '9'; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000458 re_syntax_table[a] = Sword;
459 }
460 re_compile_initialized = 1;
461 for (a = 0; a < 256; a++)
462 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000463 regexp_plain_ops[a] = Rnormal;
464 regexp_quoted_ops[a] = Rnormal;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000465 }
466 for (a = '0'; a <= '9'; a++)
467 regexp_quoted_ops[a] = Rmemory;
468 regexp_plain_ops['\134'] = Rquote;
469 if (regexp_syntax & RE_NO_BK_PARENS)
470 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000471 regexp_plain_ops['('] = Ropenpar;
472 regexp_plain_ops[')'] = Rclosepar;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000473 }
474 else
475 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000476 regexp_quoted_ops['('] = Ropenpar;
477 regexp_quoted_ops[')'] = Rclosepar;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000478 }
479 if (regexp_syntax & RE_NO_BK_VBAR)
480 regexp_plain_ops['\174'] = Ror;
481 else
482 regexp_quoted_ops['\174'] = Ror;
483 regexp_plain_ops['*'] = Rstar;
484 if (regexp_syntax & RE_BK_PLUS_QM)
485 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000486 regexp_quoted_ops['+'] = Rplus;
487 regexp_quoted_ops['?'] = Roptional;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000488 }
489 else
490 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000491 regexp_plain_ops['+'] = Rplus;
492 regexp_plain_ops['?'] = Roptional;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000493 }
494 if (regexp_syntax & RE_NEWLINE_OR)
495 regexp_plain_ops['\n'] = Ror;
496 regexp_plain_ops['\133'] = Ropenset;
497 regexp_plain_ops['\136'] = Rbol;
498 regexp_plain_ops['$'] = Reol;
499 regexp_plain_ops['.'] = Ranychar;
500 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
501 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000502 regexp_quoted_ops['w'] = Rwordchar;
503 regexp_quoted_ops['W'] = Rnotwordchar;
504 regexp_quoted_ops['<'] = Rwordbeg;
505 regexp_quoted_ops['>'] = Rwordend;
506 regexp_quoted_ops['b'] = Rwordbound;
507 regexp_quoted_ops['B'] = Rnotwordbound;
508 regexp_quoted_ops['`'] = Rbegbuf;
509 regexp_quoted_ops['\''] = Rendbuf;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000510 }
511 if (regexp_syntax & RE_ANSI_HEX)
512 regexp_quoted_ops['v'] = Rextended_memory;
513 for (a = 0; a < Rnum_ops; a++)
514 regexp_precedences[a] = 4;
515 if (regexp_syntax & RE_TIGHT_VBAR)
516 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000517 regexp_precedences[Ror] = 3;
518 regexp_precedences[Rbol] = 2;
519 regexp_precedences[Reol] = 2;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000520 }
521 else
522 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000523 regexp_precedences[Ror] = 2;
524 regexp_precedences[Rbol] = 3;
525 regexp_precedences[Reol] = 3;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000526 }
527 regexp_precedences[Rclosepar] = 1;
528 regexp_precedences[Rend] = 0;
529 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
530 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000531}
532
Guido van Rossum004c1e11997-05-09 02:35:58 +0000533int re_set_syntax(int syntax)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000534{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000535 int ret;
536
537 ret = regexp_syntax;
538 regexp_syntax = syntax;
539 re_syntax = syntax; /* Exported copy */
540 re_compile_initialize();
541 return ret;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000542}
543
Guido van Rossum004c1e11997-05-09 02:35:58 +0000544static int hex_char_to_decimal(int ch)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000545{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000546 if (ch >= '0' && ch <= '9')
547 return ch - '0';
548 if (ch >= 'a' && ch <= 'f')
549 return ch - 'a' + 10;
550 if (ch >= 'A' && ch <= 'F')
551 return ch - 'A' + 10;
552 return 16;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000553}
554
Guido van Rossum004c1e11997-05-09 02:35:58 +0000555static void re_compile_fastmap_aux(char *code,
556 int pos,
557 char *visited,
558 char *can_be_null,
559 char *fastmap)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000560{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000561 int a;
562 int b;
563 int syntaxcode;
564
565 if (visited[pos])
566 return; /* we have already been here */
567 visited[pos] = 1;
568 for (;;)
569 switch (code[pos++])
570 {
571 case Cend:
572 {
573 *can_be_null = 1;
574 return;
575 }
576 case Cbol:
577 case Cbegbuf:
578 case Cendbuf:
579 case Cwordbeg:
580 case Cwordend:
581 case Cwordbound:
582 case Cnotwordbound:
583 {
584 break;
585 }
586 case Csyntaxspec:
587 {
588 syntaxcode = code[pos++];
589 for (a = 0; a < 256; a++)
590 if (SYNTAX(a) == syntaxcode)
591 fastmap[a] = 1;
592 return;
593 }
594 case Cnotsyntaxspec:
595 {
596 syntaxcode = code[pos++];
597 for (a = 0; a < 256; a++)
598 if (SYNTAX(a) != syntaxcode)
599 fastmap[a] = 1;
600 return;
601 }
602 case Ceol:
603 {
604 fastmap['\n'] = 1;
605 if (*can_be_null == 0)
606 *can_be_null = 2; /* can match null, but only at end of buffer*/
607 return;
608 }
609 case Cset:
610 {
611 for (a = 0; a < 256/8; a++)
612 if (code[pos + a] != 0)
613 for (b = 0; b < 8; b++)
614 if (code[pos + a] & (1 << b))
615 fastmap[(a << 3) + b] = 1;
616 pos += 256/8;
617 return;
618 }
619 case Cexact:
620 {
621 fastmap[(unsigned char)code[pos]] = 1;
622 return;
623 }
624 case Canychar:
625 {
626 for (a = 0; a < 256; a++)
627 if (a != '\n')
628 fastmap[a] = 1;
629 return;
630 }
631 case Cstart_memory:
632 case Cend_memory:
633 {
634 pos++;
635 break;
636 }
637 case Cmatch_memory:
638 {
639 for (a = 0; a < 256; a++)
640 fastmap[a] = 1;
641 *can_be_null = 1;
642 return;
643 }
644 case Cjump:
645 case Cdummy_failure_jump:
646 case Cupdate_failure_jump:
647 case Cstar_jump:
648 {
649 a = (unsigned char)code[pos++];
650 a |= (unsigned char)code[pos++] << 8;
651 pos += (int)(short)a;
652 if (visited[pos])
653 {
654 /* argh... the regexp contains empty loops. This is not
655 good, as this may cause a failure stack overflow when
656 matching. Oh well. */
657 /* this path leads nowhere; pursue other paths. */
658 return;
659 }
660 visited[pos] = 1;
661 break;
662 }
663 case Cfailure_jump:
664 {
665 a = (unsigned char)code[pos++];
666 a |= (unsigned char)code[pos++] << 8;
667 a = pos + (int)(short)a;
668 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
669 break;
670 }
671 default:
672 {
673 abort(); /* probably some opcode is missing from this switch */
674 /*NOTREACHED*/
675 }
676 }
677}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000678
Guido van Rossum004c1e11997-05-09 02:35:58 +0000679static int re_do_compile_fastmap(char *buffer,
680 int used,
681 int pos,
682 char *can_be_null,
683 char *fastmap)
684{
685 char small_visited[512], *visited;
686
687 if (used <= sizeof(small_visited))
688 visited = small_visited;
689 else
690 {
691 visited = malloc(used);
692 if (!visited)
693 return 0;
694 }
695 *can_be_null = 0;
696 memset(fastmap, 0, 256);
697 memset(visited, 0, used);
698 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
699 if (visited != small_visited)
700 free(visited);
701 return 1;
702}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000703
Guido van Rossum004c1e11997-05-09 02:35:58 +0000704void re_compile_fastmap(regexp_t bufp)
705{
706 if (!bufp->fastmap || bufp->fastmap_accurate)
707 return;
708 assert(bufp->used > 0);
709 if (!re_do_compile_fastmap(bufp->buffer,
710 bufp->used,
711 0,
712 &bufp->can_be_null,
713 bufp->fastmap))
714 return;
715 if (bufp->buffer[0] == Cbol)
716 bufp->anchor = 1; /* begline */
717 else
718 if (bufp->buffer[0] == Cbegbuf)
719 bufp->anchor = 2; /* begbuf */
720 else
721 bufp->anchor = 0; /* none */
722 bufp->fastmap_accurate = 1;
723}
724
725/*
726 * star is coded as:
727 * 1: failure_jump 2
728 * ... code for operand of star
729 * star_jump 1
730 * 2: ... code after star
731 *
732 * We change the star_jump to update_failure_jump if we can determine
733 * that it is safe to do so; otherwise we change it to an ordinary
734 * jump.
735 *
736 * plus is coded as
737 *
738 * jump 2
739 * 1: failure_jump 3
740 * 2: ... code for operand of plus
741 * star_jump 1
742 * 3: ... code after plus
743 *
744 * For star_jump considerations this is processed identically to star.
745 *
746 */
747
748static int re_optimize_star_jump(regexp_t bufp, char *code)
749{
750 char map[256];
751 char can_be_null;
752 char *p1;
753 char *p2;
754 char ch;
755 int a;
756 int b;
757
758 a = (unsigned char)*code++;
759 a |= (unsigned char)*code++ << 8;
760 a = (int)(short)a;
761
762 p1 = code + a + 3; /* skip the failure_jump */
763 assert(p1[-3] == Cfailure_jump);
764 p2 = code;
765 /* p1 points inside loop, p2 points to after loop */
766 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
767 p2 - bufp->buffer, &can_be_null, map))
768 goto make_normal_jump;
769
770 /* If we might introduce a new update point inside the
771 * loop, we can't optimize because then update_jump would
772 * update a wrong failure point. Thus we have to be
773 * quite careful here.
774 */
775
776 /* loop until we find something that consumes a character */
777 loop_p1:
778 switch (*p1++)
779 {
780 case Cbol:
781 case Ceol:
782 case Cbegbuf:
783 case Cendbuf:
784 case Cwordbeg:
785 case Cwordend:
786 case Cwordbound:
787 case Cnotwordbound:
788 {
789 goto loop_p1;
790 }
791 case Cstart_memory:
792 case Cend_memory:
793 {
794 p1++;
795 goto loop_p1;
796 }
797 case Cexact:
798 {
799 ch = (unsigned char)*p1++;
Guido van Rossum4917d931997-05-13 17:53:34 +0000800 if (map[(int)ch])
Guido van Rossum004c1e11997-05-09 02:35:58 +0000801 goto make_normal_jump;
802 break;
803 }
804 case Canychar:
805 {
806 for (b = 0; b < 256; b++)
807 if (b != '\n' && map[b])
808 goto make_normal_jump;
809 break;
810 }
811 case Cset:
812 {
813 for (b = 0; b < 256; b++)
814 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
815 goto make_normal_jump;
816 p1 += 256/8;
817 break;
818 }
819 default:
820 {
821 goto make_normal_jump;
822 }
823 }
824 /* now we know that we can't backtrack. */
825 while (p1 != p2 - 3)
826 {
827 switch (*p1++)
828 {
829 case Cend:
830 {
831 return 0;
832 }
833 case Cbol:
834 case Ceol:
835 case Canychar:
836 case Cbegbuf:
837 case Cendbuf:
838 case Cwordbeg:
839 case Cwordend:
840 case Cwordbound:
841 case Cnotwordbound:
842 {
843 break;
844 }
845 case Cset:
846 {
847 p1 += 256/8;
848 break;
849 }
850 case Cexact:
851 case Cstart_memory:
852 case Cend_memory:
853 case Cmatch_memory:
854 case Csyntaxspec:
855 case Cnotsyntaxspec:
856 {
857 p1++;
858 break;
859 }
860 case Cjump:
861 case Cstar_jump:
862 case Cfailure_jump:
863 case Cupdate_failure_jump:
864 case Cdummy_failure_jump:
865 {
866 goto make_normal_jump;
867 }
868 default:
869 {
870 return 0;
871 break;
872 }
873 }
874 }
875
Guido van Rossum004c1e11997-05-09 02:35:58 +0000876 code -= 3;
877 a += 3; /* jump to after the Cfailure_jump */
878 code[0] = Cupdate_failure_jump;
879 code[1] = a & 0xff;
880 code[2] = a >> 8;
881 return 1;
882
883 make_normal_jump:
884 code -= 3;
885 *code = Cjump;
886 return 1;
887}
888
889static int re_optimize(regexp_t bufp)
890{
891 char *code;
892
893 code = bufp->buffer;
894
895 while(1)
896 {
897 switch (*code++)
898 {
899 case Cend:
900 {
901 return 1;
902 }
903 case Canychar:
904 case Cbol:
905 case Ceol:
906 case Cbegbuf:
907 case Cendbuf:
908 case Cwordbeg:
909 case Cwordend:
910 case Cwordbound:
911 case Cnotwordbound:
912 {
913 break;
914 }
915 case Cset:
916 {
917 code += 256/8;
918 break;
919 }
920 case Cexact:
921 case Cstart_memory:
922 case Cend_memory:
923 case Cmatch_memory:
924 case Csyntaxspec:
925 case Cnotsyntaxspec:
926 {
927 code++;
928 break;
929 }
930 case Cstar_jump:
931 {
932 if (!re_optimize_star_jump(bufp, code))
933 {
934 return 0;
935 }
936 /* fall through */
937 }
938 case Cupdate_failure_jump:
939 case Cjump:
940 case Cdummy_failure_jump:
941 case Cfailure_jump:
942 {
943 code += 2;
944 break;
945 }
946 default:
947 {
948 return 0;
949 }
950 }
951 }
952}
953
954#define NEXTCHAR(var) \
955{ \
956 if (pos >= size) \
957 goto ends_prematurely; \
958 (var) = regex[pos]; \
959 pos++; \
960}
961
962#define ALLOC(amount) \
963{ \
964 if (pattern_offset+(amount) > alloc) \
965 { \
966 alloc += 256 + (amount); \
967 pattern = realloc(pattern, alloc); \
968 if (!pattern) \
969 goto out_of_memory; \
970 } \
971}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000972
973#define STORE(ch) pattern[pattern_offset++] = (ch)
974
975#define CURRENT_LEVEL_START (starts[starts_base + current_level])
976
977#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
978
Guido van Rossum004c1e11997-05-09 02:35:58 +0000979#define PUSH_LEVEL_STARTS \
980 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
981 starts_base += NUM_LEVELS; \
982 else \
983 goto too_complex
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000984
985#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
986
Guido van Rossum004c1e11997-05-09 02:35:58 +0000987#define PUT_ADDR(offset,addr) \
988{ \
989 int disp = (addr) - (offset) - 2; \
990 pattern[(offset)] = disp & 0xff; \
991 pattern[(offset)+1] = (disp>>8) & 0xff; \
992}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000993
Guido van Rossum004c1e11997-05-09 02:35:58 +0000994#define INSERT_JUMP(pos,type,addr) \
995{ \
996 int a, p = (pos), t = (type), ad = (addr); \
997 for (a = pattern_offset - 1; a >= p; a--) \
998 pattern[a + 3] = pattern[a]; \
999 pattern[p] = t; \
1000 PUT_ADDR(p+1,ad); \
1001 pattern_offset += 3; \
1002}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001003#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
1004
Guido van Rossum004c1e11997-05-09 02:35:58 +00001005#define SET_FIELDS \
1006{ \
1007 bufp->allocated = alloc; \
1008 bufp->buffer = pattern; \
1009 bufp->used = pattern_offset; \
1010}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001011
Guido van Rossum004c1e11997-05-09 02:35:58 +00001012#define GETHEX(var) \
1013{ \
1014 char gethex_ch, gethex_value; \
1015 NEXTCHAR(gethex_ch); \
1016 gethex_value = hex_char_to_decimal(gethex_ch); \
1017 if (gethex_value == 16) \
1018 goto hex_error; \
1019 NEXTCHAR(gethex_ch); \
1020 gethex_ch = hex_char_to_decimal(gethex_ch); \
1021 if (gethex_ch == 16) \
1022 goto hex_error; \
1023 (var) = gethex_value * 16 + gethex_ch; \
1024}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001025
Guido van Rossum004c1e11997-05-09 02:35:58 +00001026#define ANSI_TRANSLATE(ch) \
1027{ \
1028 switch (ch) \
1029 { \
1030 case 'a': \
1031 case 'A': \
1032 { \
1033 ch = 7; /* audible bell */ \
1034 break; \
1035 } \
1036 case 'b': \
1037 case 'B': \
1038 { \
1039 ch = 8; /* backspace */ \
1040 break; \
1041 } \
1042 case 'f': \
1043 case 'F': \
1044 { \
1045 ch = 12; /* form feed */ \
1046 break; \
1047 } \
1048 case 'n': \
1049 case 'N': \
1050 { \
1051 ch = 10; /* line feed */ \
1052 break; \
1053 } \
1054 case 'r': \
1055 case 'R': \
1056 { \
1057 ch = 13; /* carriage return */ \
1058 break; \
1059 } \
1060 case 't': \
1061 case 'T': \
1062 { \
1063 ch = 9; /* tab */ \
1064 break; \
1065 } \
1066 case 'v': \
1067 case 'V': \
1068 { \
1069 ch = 11; /* vertical tab */ \
1070 break; \
1071 } \
1072 case 'x': /* hex code */ \
1073 case 'X': \
1074 { \
1075 GETHEX(ch); \
1076 break; \
1077 } \
1078 default: \
1079 { \
1080 /* other characters passed through */ \
1081 if (translate) \
1082 ch = translate[(unsigned char)ch]; \
1083 break; \
1084 } \
1085 } \
1086}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001087
Guido van Rossum004c1e11997-05-09 02:35:58 +00001088char *re_compile_pattern(char *regex, int size, regexp_t bufp)
1089{
1090 int a;
1091 int pos;
1092 int op;
1093 int current_level;
1094 int level;
1095 int opcode;
Guido van Rossum4917d931997-05-13 17:53:34 +00001096 int pattern_offset = 0, alloc;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001097 int starts[NUM_LEVELS * MAX_NESTING];
1098 int starts_base;
1099 int future_jumps[MAX_NESTING];
1100 int num_jumps;
Guido van Rossum4917d931997-05-13 17:53:34 +00001101 unsigned char ch = '\0';
Guido van Rossum004c1e11997-05-09 02:35:58 +00001102 char *pattern;
1103 char *translate;
1104 int next_register;
1105 int paren_depth;
1106 int num_open_registers;
1107 int open_registers[RE_NREGS];
1108 int beginning_context;
1109
1110 if (!re_compile_initialized)
1111 re_compile_initialize();
1112 bufp->used = 0;
1113 bufp->fastmap_accurate = 0;
1114 bufp->uses_registers = 0;
1115 translate = bufp->translate;
1116 pattern = bufp->buffer;
1117 alloc = bufp->allocated;
1118 if (alloc == 0 || pattern == NULL)
1119 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001120 alloc = 256;
1121 pattern = malloc(alloc);
1122 if (!pattern)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001123 goto out_of_memory;
1124 }
1125 pattern_offset = 0;
1126 starts_base = 0;
1127 num_jumps = 0;
1128 current_level = 0;
1129 SET_LEVEL_START;
1130 num_open_registers = 0;
1131 next_register = 1;
1132 paren_depth = 0;
1133 beginning_context = 1;
1134 op = -1;
1135 /* we use Rend dummy to ensure that pending jumps are updated (due to
1136 low priority of Rend) before exiting the loop. */
1137 pos = 0;
1138 while (op != Rend)
1139 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001140 if (pos >= size)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001141 op = Rend;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001142 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001143 {
1144 NEXTCHAR(ch);
1145 if (translate)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001146 ch = translate[(unsigned char)ch];
Guido van Rossum004c1e11997-05-09 02:35:58 +00001147 op = regexp_plain_ops[(unsigned char)ch];
1148 if (op == Rquote)
1149 {
1150 NEXTCHAR(ch);
1151 op = regexp_quoted_ops[(unsigned char)ch];
1152 if (op == Rnormal && regexp_ansi_sequences)
1153 ANSI_TRANSLATE(ch);
1154 }
1155 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001156 level = regexp_precedences[op];
1157 /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
Guido van Rossum004c1e11997-05-09 02:35:58 +00001158 ch, op, level, current_level, CURRENT_LEVEL_START); */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001159 if (level > current_level)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001160 {
1161 for (current_level++; current_level < level; current_level++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001162 SET_LEVEL_START;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001163 SET_LEVEL_START;
1164 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001165 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001166 if (level < current_level)
1167 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001168 current_level = level;
1169 for (;num_jumps > 0 &&
Guido van Rossum004c1e11997-05-09 02:35:58 +00001170 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001171 num_jumps--)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001172 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
1173 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001174 switch (op)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001175 {
1176 case Rend:
1177 {
1178 break;
1179 }
1180 case Rnormal:
1181 {
1182 normal_char:
1183 opcode = Cexact;
1184 store_opcode_and_arg: /* opcode & ch must be set */
1185 SET_LEVEL_START;
1186 ALLOC(2);
1187 STORE(opcode);
1188 STORE(ch);
1189 break;
1190 }
1191 case Ranychar:
1192 {
1193 opcode = Canychar;
1194 store_opcode:
1195 SET_LEVEL_START;
1196 ALLOC(1);
1197 STORE(opcode);
1198 break;
1199 }
1200 case Rquote:
1201 {
1202 abort();
1203 /*NOTREACHED*/
1204 }
1205 case Rbol:
1206 {
1207 if (!beginning_context)
1208 if (regexp_context_indep_ops)
1209 goto op_error;
1210 else
1211 goto normal_char;
1212 opcode = Cbol;
1213 goto store_opcode;
1214 }
1215 case Reol:
1216 {
1217 if (!((pos >= size) ||
1218 ((regexp_syntax & RE_NO_BK_VBAR) ?
1219 (regex[pos] == '\174') :
1220 (pos+1 < size && regex[pos] == '\134' &&
1221 regex[pos+1] == '\174')) ||
1222 ((regexp_syntax & RE_NO_BK_PARENS)?
1223 (regex[pos] == ')'):
1224 (pos+1 < size && regex[pos] == '\134' &&
1225 regex[pos+1] == ')'))))
1226 if (regexp_context_indep_ops)
1227 goto op_error;
1228 else
1229 goto normal_char;
1230 opcode = Ceol;
1231 goto store_opcode;
1232 /* NOTREACHED */
1233 break;
1234 }
1235 case Roptional:
1236 {
1237 if (beginning_context)
1238 if (regexp_context_indep_ops)
1239 goto op_error;
1240 else
1241 goto normal_char;
1242 if (CURRENT_LEVEL_START == pattern_offset)
1243 break; /* ignore empty patterns for ? */
1244 ALLOC(3);
1245 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1246 pattern_offset + 3);
1247 break;
1248 }
1249 case Rstar:
1250 case Rplus:
1251 {
1252 if (beginning_context)
1253 if (regexp_context_indep_ops)
1254 goto op_error;
1255 else
1256 goto normal_char;
1257 if (CURRENT_LEVEL_START == pattern_offset)
1258 break; /* ignore empty patterns for + and * */
1259 ALLOC(9);
1260 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1261 pattern_offset + 6);
1262 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1263 if (op == Rplus) /* jump over initial failure_jump */
1264 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1265 CURRENT_LEVEL_START + 6);
1266 break;
1267 }
1268 case Ror:
1269 {
1270 ALLOC(6);
1271 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1272 pattern_offset + 6);
1273 if (num_jumps >= MAX_NESTING)
1274 goto too_complex;
1275 STORE(Cjump);
1276 future_jumps[num_jumps++] = pattern_offset;
1277 STORE(0);
1278 STORE(0);
1279 SET_LEVEL_START;
1280 break;
1281 }
1282 case Ropenpar:
1283 {
1284 SET_LEVEL_START;
1285 if (next_register < RE_NREGS)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001286 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001287 bufp->uses_registers = 1;
1288 ALLOC(2);
1289 STORE(Cstart_memory);
1290 STORE(next_register);
1291 open_registers[num_open_registers++] = next_register;
1292 next_register++;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001293 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001294 paren_depth++;
1295 PUSH_LEVEL_STARTS;
1296 current_level = 0;
1297 SET_LEVEL_START;
1298 break;
1299 }
1300 case Rclosepar:
1301 {
1302 if (paren_depth <= 0)
1303 goto parenthesis_error;
1304 POP_LEVEL_STARTS;
1305 current_level = regexp_precedences[Ropenpar];
1306 paren_depth--;
1307 if (paren_depth < num_open_registers)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001308 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001309 bufp->uses_registers = 1;
1310 ALLOC(2);
1311 STORE(Cend_memory);
1312 num_open_registers--;
1313 STORE(open_registers[num_open_registers]);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001314 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001315 break;
1316 }
1317 case Rmemory:
1318 {
1319 if (ch == '0')
1320 goto bad_match_register;
1321 assert(ch >= '0' && ch <= '9');
1322 bufp->uses_registers = 1;
1323 opcode = Cmatch_memory;
1324 ch -= '0';
1325 goto store_opcode_and_arg;
1326 }
1327 case Rextended_memory:
1328 {
1329 NEXTCHAR(ch);
1330 if (ch < '0' || ch > '9')
1331 goto bad_match_register;
1332 NEXTCHAR(a);
1333 if (a < '0' || a > '9')
1334 goto bad_match_register;
1335 ch = 10 * (a - '0') + ch - '0';
1336 if (ch <= 0 || ch >= RE_NREGS)
1337 goto bad_match_register;
1338 bufp->uses_registers = 1;
1339 opcode = Cmatch_memory;
1340 goto store_opcode_and_arg;
1341 }
1342 case Ropenset:
1343 {
1344 int complement;
1345 int prev;
1346 int offset;
1347 int range;
1348 int firstchar;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001349
1350 SET_LEVEL_START;
1351 ALLOC(1+256/8);
1352 STORE(Cset);
1353 offset = pattern_offset;
1354 for (a = 0; a < 256/8; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001355 STORE(0);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001356 NEXTCHAR(ch);
1357 if (translate)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001358 ch = translate[(unsigned char)ch];
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001359 if (ch == '\136')
Guido van Rossum004c1e11997-05-09 02:35:58 +00001360 {
1361 complement = 1;
1362 NEXTCHAR(ch);
1363 if (translate)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001364 ch = translate[(unsigned char)ch];
Guido van Rossum004c1e11997-05-09 02:35:58 +00001365 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001366 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001367 complement = 0;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001368 prev = -1;
1369 range = 0;
1370 firstchar = 1;
1371 while (ch != '\135' || firstchar)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001372 {
1373 firstchar = 0;
1374 if (regexp_ansi_sequences && ch == '\134')
1375 {
1376 NEXTCHAR(ch);
1377 ANSI_TRANSLATE(ch);
1378 }
1379 if (range)
1380 {
1381 for (a = prev; a <= (int)ch; a++)
1382 SETBIT(pattern, offset, a);
1383 prev = -1;
1384 range = 0;
1385 }
1386 else
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001387 if (prev != -1 && ch == '-')
Guido van Rossum004c1e11997-05-09 02:35:58 +00001388 range = 1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001389 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001390 {
1391 SETBIT(pattern, offset, ch);
1392 prev = ch;
1393 }
1394 NEXTCHAR(ch);
1395 if (translate)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001396 ch = translate[(unsigned char)ch];
Guido van Rossum004c1e11997-05-09 02:35:58 +00001397 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001398 if (range)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001399 SETBIT(pattern, offset, '-');
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001400 if (complement)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001401 {
1402 for (a = 0; a < 256/8; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001403 pattern[offset+a] ^= 0xff;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001404 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001405 break;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001406 }
1407 case Rbegbuf:
1408 {
1409 opcode = Cbegbuf;
1410 goto store_opcode;
1411 }
1412 case Rendbuf:
1413 {
1414 opcode = Cendbuf;
1415 goto store_opcode;
1416 }
1417 case Rwordchar:
1418 {
1419 opcode = Csyntaxspec;
1420 ch = Sword;
1421 goto store_opcode_and_arg;
1422 }
1423 case Rnotwordchar:
1424 {
1425 opcode = Cnotsyntaxspec;
1426 ch = Sword;
1427 goto store_opcode_and_arg;
1428 }
1429 case Rwordbeg:
1430 {
1431 opcode = Cwordbeg;
1432 goto store_opcode;
1433 }
1434 case Rwordend:
1435 {
1436 opcode = Cwordend;
1437 goto store_opcode;
1438 }
1439 case Rwordbound:
1440 {
1441 opcode = Cwordbound;
1442 goto store_opcode;
1443 }
1444 case Rnotwordbound:
1445 {
1446 opcode = Cnotwordbound;
1447 goto store_opcode;
1448 }
1449 default:
1450 {
1451 abort();
1452 }
1453 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001454 beginning_context = (op == Ropenpar || op == Ror);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001455 }
1456 if (starts_base != 0)
1457 goto parenthesis_error;
1458 assert(num_jumps == 0);
1459 ALLOC(1);
1460 STORE(Cend);
1461 SET_FIELDS;
1462 if(!re_optimize(bufp))
1463 return "Optimization error";
1464 return NULL;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001465
Guido van Rossum004c1e11997-05-09 02:35:58 +00001466 op_error:
1467 SET_FIELDS;
1468 return "Badly placed special character";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001469
Guido van Rossum004c1e11997-05-09 02:35:58 +00001470 bad_match_register:
1471 SET_FIELDS;
1472 return "Bad match register number";
1473
1474 hex_error:
1475 SET_FIELDS;
1476 return "Bad hexadecimal number";
1477
1478 parenthesis_error:
1479 SET_FIELDS;
1480 return "Badly placed parenthesis";
1481
1482 out_of_memory:
1483 SET_FIELDS;
1484 return "Out of memory";
1485
1486 ends_prematurely:
1487 SET_FIELDS;
1488 return "Regular expression ends prematurely";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001489
Guido van Rossum004c1e11997-05-09 02:35:58 +00001490 too_complex:
1491 SET_FIELDS;
1492 return "Regular expression too complex";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001493}
Guido van Rossum004c1e11997-05-09 02:35:58 +00001494
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001495#undef CHARAT
1496#undef NEXTCHAR
1497#undef GETHEX
1498#undef ALLOC
1499#undef STORE
1500#undef CURRENT_LEVEL_START
1501#undef SET_LEVEL_START
1502#undef PUSH_LEVEL_STARTS
1503#undef POP_LEVEL_STARTS
1504#undef PUT_ADDR
1505#undef INSERT_JUMP
1506#undef SETBIT
1507#undef SET_FIELDS
1508
Guido van Rossum004c1e11997-05-09 02:35:58 +00001509#define PREFETCH if (text == textend) goto fail
1510
1511#define NEXTCHAR(var) \
1512PREFETCH; \
1513var = (unsigned char)*text++; \
1514if (translate) \
1515 var = translate[var]
1516
1517int re_match(regexp_t bufp,
1518 char *string,
1519 int size,
1520 int pos,
1521 regexp_registers_t old_regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001522{
Guido van Rossum004c1e11997-05-09 02:35:58 +00001523 char *code;
1524 char *translate;
1525 char *text;
1526 char *textstart;
1527 char *textend;
1528 int a;
1529 int b;
1530 int ch;
1531 int reg;
1532 int match_end;
1533 char *regstart;
1534 char *regend;
1535 int regsize;
1536 match_state state;
1537
1538 assert(pos >= 0 && size >= 0);
1539 assert(pos <= size);
1540
1541 text = string + pos;
1542 textstart = string;
1543 textend = string + size;
1544
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001545 code = bufp->buffer;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001546
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001547 translate = bufp->translate;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001548/* translated = NULL; */
1549/* if (bufp->translate) */
1550/* { */
1551/* char *t1; */
1552/* char *t2; */
1553
1554/* translated = malloc(size); */
1555/* if (translated == NULL) */
1556/* goto error; */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001557
Guido van Rossum004c1e11997-05-09 02:35:58 +00001558/* t1 = string; */
1559/* t2 = translated; */
1560/* while(t1 < textend) */
1561/* *t2++ = bufp->translate[*t1++]; */
1562
1563/* text = translated + pos; */
1564/* textstart = translated; */
1565/* textend = translated + size; */
1566/* } */
1567
1568 NEW_STATE(state);
1569
1570 continue_matching:
1571 switch (*code++)
1572 {
1573 case Cend:
1574 {
1575 match_end = text - textstart;
1576 if (old_regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001577 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001578 old_regs->start[0] = pos;
1579 old_regs->end[0] = match_end;
1580 if (!bufp->uses_registers)
1581 {
1582 for (a = 1; a < RE_NREGS; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001583 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001584 old_regs->start[a] = -1;
1585 old_regs->end[a] = -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001586 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001587 }
1588 else
1589 {
1590 for (a = 1; a < RE_NREGS; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001591 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001592 if ((GET_REG_START(state, a) == NULL) ||
1593 (GET_REG_END(state, a) == NULL))
1594 {
1595 old_regs->start[a] = -1;
1596 old_regs->end[a] = -1;
1597 continue;
1598 }
1599 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1600 old_regs->end[a] = GET_REG_END(state, a) - textstart;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001601 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001602 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001603 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001604/* if(translated) */
1605/* free(translated); */
1606 FREE_STATE(state);
1607 return match_end - pos;
1608 }
1609 case Cbol:
1610 {
1611 if (text == textstart || text[-1] == '\n')
1612 goto continue_matching;
1613 goto fail;
1614 }
1615 case Ceol:
1616 {
1617 if (text == textend || *text == '\n')
1618 goto continue_matching;
1619 goto fail;
1620 }
1621 case Cset:
1622 {
1623 NEXTCHAR(ch);
1624 if (code[ch/8] & (1<<(ch & 7)))
1625 {
1626 code += 256/8;
1627 goto continue_matching;
1628 }
1629 goto fail;
1630 }
1631 case Cexact:
1632 {
1633 NEXTCHAR(ch);
1634 if (ch != (unsigned char)*code++)
1635 goto fail;
1636/* { */
1637/* char *p1 = code - 2; */
1638/* ch = *(code - 1); */
1639/* POP_FAILURE(state, code, text, goto done_matching, goto error); */
1640/* while ((code == p1) && (*text != ch)) */
1641/* POP_FAILURE(state, code, text, goto done_matching, goto error); */
1642/* if ((code == p1) && (*text == ch)) */
1643/* { */
1644/* code += 2; */
1645/* text++; */
1646/* } */
1647/* } */
1648 goto continue_matching;
1649 }
1650 case Canychar:
1651 {
1652 NEXTCHAR(ch);
1653 if (ch == '\n')
1654 goto fail;
1655 goto continue_matching;
1656 }
1657 case Cstart_memory:
1658 {
1659 reg = *code++;
1660 SET_REG_START(state, reg, text, goto error);
1661 goto continue_matching;
1662 }
1663 case Cend_memory:
1664 {
1665 reg = *code++;
1666 SET_REG_END(state, reg, text, goto error);
1667 goto continue_matching;
1668 }
1669 case Cmatch_memory:
1670 {
1671 reg = *code++;
1672 regstart = GET_REG_START(state, reg);
1673 regend = GET_REG_END(state, reg);
1674 if ((regstart == NULL) || (regend == NULL))
1675 goto fail; /* or should we just match nothing? */
1676 regsize = regend - regstart;
1677
1678 if (regsize > (textend - text))
1679 goto fail;
1680 if(translate)
1681 {
1682 for (; regstart < regend; regstart++, text++)
1683 if (translate[*regstart] != translate[*text])
1684 goto fail;
1685 }
1686 else
1687 for (; regstart < regend; regstart++, text++)
1688 if (*regstart != *text)
1689 goto fail;
1690/* if (memcmp(text, regstart, regsize) != 0)
1691 goto fail;
1692 text += regsize; */
1693 goto continue_matching;
1694 }
1695 case Cupdate_failure_jump:
1696 {
1697 UPDATE_FAILURE(state, text, goto error);
1698 /* fall to next case */
1699 }
1700 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1701 case Cstar_jump:
1702 case Cjump:
1703 {
1704 a = (unsigned char)*code++;
1705 a |= (unsigned char)*code++ << 8;
1706 code += (int)(short)a;
1707 goto continue_matching;
1708 }
1709 case Cdummy_failure_jump:
1710 {
1711 a = (unsigned char)*code++;
1712 a |= (unsigned char)*code++ << 8;
1713 a = (int)(short)a;
1714 assert(*code == Cfailure_jump);
1715 b = (unsigned char)code[1];
1716 b |= (unsigned char)code[2] << 8;
1717 PUSH_FAILURE(state, code + (int)(short)b + 3, NULL, goto error);
1718 code += a;
1719 goto continue_matching;
1720 }
1721 case Cfailure_jump:
1722 {
1723 a = (unsigned char)*code++;
1724 a |= (unsigned char)*code++ << 8;
1725 a = (int)(short)a;
1726 PUSH_FAILURE(state, code + a, text, goto error);
1727 goto continue_matching;
1728 }
1729 case Cbegbuf:
1730 {
1731 if (text == textstart)
1732 goto continue_matching;
1733 goto fail;
1734 }
1735 case Cendbuf:
1736 {
1737 if (text == textend)
1738 goto continue_matching;
1739 goto fail;
1740 }
1741 case Cwordbeg:
1742 {
1743 if (text == textend)
1744 goto fail;
1745 if (SYNTAX(*text) != Sword)
1746 goto fail;
1747 if (text == textstart)
1748 goto continue_matching;
1749 if (SYNTAX(text[-1]) != Sword)
1750 goto continue_matching;
1751 goto fail;
1752 }
1753 case Cwordend:
1754 {
1755 if (text == textstart)
1756 goto fail;
1757 if (SYNTAX(text[-1]) != Sword)
1758 goto fail;
1759 if (text == textend)
1760 goto continue_matching;
1761 if (SYNTAX(*text) == Sword)
1762 goto fail;
1763 goto continue_matching;
1764 }
1765 case Cwordbound:
1766 {
1767 /* Note: as in gnu regexp, this also matches at the beginning
1768 * and end of buffer. */
1769
1770 if (text == textstart || text == textend)
1771 goto continue_matching;
1772 if ((SYNTAX(text[-1]) == Sword) ^ (SYNTAX(*text) == Sword))
1773 goto continue_matching;
1774 goto fail;
1775 }
1776 case Cnotwordbound:
1777 {
1778 /* Note: as in gnu regexp, this never matches at the beginning
1779 * and end of buffer. */
1780 if (text == textstart || text == textend)
1781 goto fail;
1782 if (!((SYNTAX(text[-1]) == Sword) ^ (SYNTAX(*text) == Sword)))
1783 goto fail;
1784 goto continue_matching;
1785 }
1786 case Csyntaxspec:
1787 {
1788 NEXTCHAR(ch);
1789 if (SYNTAX(ch) != (unsigned char)*code++)
1790 goto fail;
1791 goto continue_matching;
1792 }
1793 case Cnotsyntaxspec:
1794 {
1795 NEXTCHAR(ch);
1796 if (SYNTAX(ch) != (unsigned char)*code++)
1797 break;
1798 goto continue_matching;
1799 }
1800 default:
1801 {
1802 abort();
1803 /*NOTREACHED*/
1804 }
1805 }
1806
Guido van Rossum3b1a57a1992-01-27 16:47:46 +00001807#if 0 /* This line is never reached --Guido */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001808 abort();
Guido van Rossum5f21dd11992-01-19 16:49:14 +00001809#endif
Guido van Rossum004c1e11997-05-09 02:35:58 +00001810 /*
1811 *NOTREACHED
1812 */
1813
1814 fail:
1815 POP_FAILURE(state, code, text, goto done_matching, goto error);
1816 goto continue_matching;
1817
1818 done_matching:
1819/* if(translated != NULL) */
1820/* free(translated); */
1821 FREE_STATE(state);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001822 return -1;
1823
Guido van Rossum004c1e11997-05-09 02:35:58 +00001824 error:
1825/* if (translated != NULL) */
1826/* free(translated); */
1827 FREE_STATE(state);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001828 return -2;
1829}
1830
1831#undef PREFETCH
1832#undef NEXTCHAR
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001833
Guido van Rossum004c1e11997-05-09 02:35:58 +00001834int re_search(regexp_t bufp,
1835 char *string,
1836 int size,
1837 int pos,
1838 int range,
1839 regexp_registers_t regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001840{
Guido van Rossum004c1e11997-05-09 02:35:58 +00001841 char *fastmap;
1842 char *translate;
1843 char *text;
1844 char *partstart;
1845 char *partend;
1846 int dir;
1847 int ret;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001848 char anchor;
1849
Guido van Rossum004c1e11997-05-09 02:35:58 +00001850 assert(size >= 0 && pos >= 0);
1851 assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001852
1853 fastmap = bufp->fastmap;
1854 translate = bufp->translate;
1855 if (fastmap && !bufp->fastmap_accurate)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001856 re_compile_fastmap(bufp);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001857 anchor = bufp->anchor;
1858 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
Guido van Rossum004c1e11997-05-09 02:35:58 +00001859 fastmap = NULL;
1860
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001861 if (range < 0)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001862 {
1863 dir = -1;
1864 range = -range;
1865 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001866 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001867 dir = 1;
1868
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001869 if (anchor == 2)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001870 if (pos != 0)
1871 return -1;
1872 else
1873 range = 0;
1874
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001875 for (; range >= 0; range--, pos += dir)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001876 {
1877 if (fastmap)
1878 {
1879 if (dir == 1)
1880 { /* searching forwards */
1881
1882 text = string + pos;
1883 partend = string + size;
1884 partstart = text;
1885 if (translate)
1886 while (text != partend &&
1887 !fastmap[(unsigned char) translate[(unsigned char)*text]])
1888 text++;
1889 else
1890 while (text != partend && !fastmap[(unsigned char)*text])
1891 text++;
1892 pos += text - partstart;
1893 range -= text - partstart;
1894 if (pos == size && bufp->can_be_null == 0)
1895 return -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001896 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001897 else
1898 { /* searching backwards */
1899 text = string + pos;
1900 partstart = string + pos - range;
1901 partend = text;
1902 if (translate)
1903 while (text != partstart &&
1904 !fastmap[(unsigned char)
1905 translate[(unsigned char)*text]])
1906 text--;
1907 else
1908 while (text != partstart &&
1909 !fastmap[(unsigned char)*text])
1910 text--;
1911 pos -= partend - text;
1912 range -= partend - text;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001913 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001914 }
1915 if (anchor == 1)
1916 { /* anchored to begline */
Guido van Rossumfc4f5031997-05-14 18:27:51 +00001917 if (pos > 0 && (string[pos - 1] != '\n'))
Guido van Rossum004c1e11997-05-09 02:35:58 +00001918 continue;
1919 }
1920 assert(pos >= 0 && pos <= size);
1921 ret = re_match(bufp, string, size, pos, regs);
1922 if (ret >= 0)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001923 return pos;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001924 if (ret == -2)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001925 return -2;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001926 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001927 return -1;
1928}