blob: c74ece324dd2c532b025a5d4f669bed12f997a3f [file] [log] [blame]
Guido van Rossum004c1e11997-05-09 02:35:58 +00001/* regexpr.c
2 *
3 * Author: Tatu Ylonen <ylo@ngs.fi>
4 *
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
6 *
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
11 * warranty.
12 *
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
16 *
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
20 *
21 * Emacs-specific code and syntax table code is almost directly borrowed
22 * from GNU regexp.
23 *
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
28 *
29 * $Id$ */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000030
Guido van Rossum339cfa31996-08-08 19:12:37 +000031#include "config.h" /* For Win* specific redefinition of printf c.s. */
Guido van Rossum339cfa31996-08-08 19:12:37 +000032
Guido van Rossum004c1e11997-05-09 02:35:58 +000033#include "myproto.h" /* For PROTO macro --Guido */
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000034
Guido van Rossumb674c3b1992-01-19 16:32:47 +000035#include <stdio.h>
Guido van Rossum004c1e11997-05-09 02:35:58 +000036
37#ifndef NDEBUG
38#define NDEBUG 1
39#endif
40
Guido van Rossumb674c3b1992-01-19 16:32:47 +000041#include <assert.h>
42#include "regexpr.h"
43
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000044#ifdef THINK_C
45/* Think C on the Mac really needs these headers... --Guido */
46#include <stdlib.h>
47#include <string.h>
48#else
Guido van Rossum88661e81996-05-23 22:55:58 +000049#if defined(__STDC__) || defined(_MSC_VER)
Guido van Rossum9abc5391992-03-27 17:24:37 +000050/* Don't mess around, use the standard headers */
51#include <stdlib.h>
52#include <string.h>
53#else
Guido van Rossumb674c3b1992-01-19 16:32:47 +000054char *malloc();
55void free();
56char *realloc();
Guido van Rossum9abc5391992-03-27 17:24:37 +000057#endif /* __STDC__ */
58#endif /* THINK_C */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000059
/* The stack implementation is taken from an idea by Andrew Kuchling.
 * It's a doubly linked list of arrays.  The advantage of this over a
 * simple linked list is that the number of mallocs required is
 * reduced.  It also makes it possible to statically allocate enough
 * space so that small patterns don't ever need to call malloc.
 *
 * The advantage over a single array that is periodically realloced
 * when more space is needed is that we avoid ever copying the
 * stack. */
69
/* item_t is the basic stack element: a union of three record shapes,
 * so that saved registers, failure points, and counters can all be
 * pushed onto and popped from the same stack.  An item carries no tag
 * saying which of the three shapes it currently holds. */

typedef union item_t
{
	/* A saved register. */
	struct
	{
		int num;	/* register number */
		int level;	/* saved 'changed' level of the register */
		char *start;	/* saved register start */
		char *end;	/* saved register end */
	} reg;

	/* A failure (backtracking) point. */
	struct
	{
		int count;	/* state.count when the point was pushed */
		int level;	/* state.level when the point was pushed */
		int phantom;	/* 0 from PUSH_FAILURE, 1 from UPDATE_FAILURE */
		char *code;	/* compiled-code position to resume at */
		char *text;	/* text position to resume at */
	} fail;

	/* A counter. */
	struct
	{
		int num;
		int level;
		int count;
	} cntr;
} item_t;
100
101#define STACK_PAGE_SIZE 256
102#define NUM_REGISTERS 256
103
104/* A 'page' of stack items. */
105
106typedef struct item_page_t
107{
108 item_t items[STACK_PAGE_SIZE];
109 struct item_page_t *prev;
110 struct item_page_t *next;
111} item_page_t;
112
113
114typedef struct match_state
115{
116 /* Structure to encapsulate the stack. */
117 struct
118 {
119 /* index into the curent page. If index == 0 and you need
120 * to pop and item, move to the previous page and set
121 * index = STACK_PAGE_SIZE - 1. Otherwise decrement index
122 * to push a page. If index == STACK_PAGE_SIZE and you
123 * need to push a page move to the next page and set index
124 * = 0. If there is no new next page, allocate a new page
125 * and link it in. Otherwise, increment index to push a
126 * page. */
127 int index;
128 item_page_t *current; /* Pointer to the current page. */
129 item_page_t first; /* First page is statically allocated. */
130 } stack;
131 char *start[NUM_REGISTERS];
132 char *end[NUM_REGISTERS];
133
134 int changed[NUM_REGISTERS];
135 /* The number of registers that have been pushed onto the stack
136 * since the last failure point. */
137 int count;
138 /* Used to control when registers need to be pushed onto the
139 * stack. */
140 int level;
141 /* The number of failure points on the stack. */
142 int point;
143} match_state;
144
/* Drop the top 'count' items from the stack, stepping back one page at
 * a time while the index is negative.  'on_error' is executed when a
 * page boundary has to be crossed but there is no previous page. */

#define STACK_DISCARD(stack, count, on_error) \
stack.index -= count; \
while (stack.index < 0) \
{ \
	if (!stack.current->prev) \
		on_error; \
	stack.current = stack.current->prev; \
	stack.index += STACK_PAGE_SIZE; \
}
156
/* Pop: step the stack back by one item and leave 'top' pointing at the
 * item that was just uncovered.  'on_error' is executed when the index
 * is 0 and there is no previous page. */

#define STACK_PREV(stack, top, on_error) \
if (stack.index != 0) \
	stack.index--; \
else \
{ \
	if (!stack.current->prev) \
		on_error; \
	stack.current = stack.current->prev; \
	stack.index = STACK_PAGE_SIZE - 1; \
} \
top = stack.current->items + stack.index
171
/* Push: leave 'top' pointing at the next free item and advance the
 * index past it.  When the current page is full the next page is used,
 * and is malloc'ed and linked in first if it does not exist yet.
 * 'on_error' is executed when that malloc fails. */

#define STACK_NEXT(stack, top, on_error) \
if (stack.index == STACK_PAGE_SIZE) \
{ \
	if (!stack.current->next) \
	{ \
		stack.current->next = malloc(sizeof(item_page_t)); \
		if (!stack.current->next) \
			on_error; \
		stack.current->next->next = NULL; \
		stack.current->next->prev = stack.current; \
	} \
	stack.index = 0; \
	stack.current = stack.current->next; \
} \
top = stack.current->items + stack.index++
190
/* Leave 'top' pointing at the item 'count' items below the top of the
 * stack without changing the stack itself.  With count == 1 this is
 * the same item STACK_TOP yields.  'on_error' is executed when the
 * walk would have to go below the first page. */

#define STACK_BACK(stack, top, count, on_error) \
{ \
	item_page_t *back_page = stack.current; \
	int back_index = stack.index - (count); \
	while (back_index < 0) \
	{ \
		if (!back_page->prev) \
			on_error; \
		back_page = back_page->prev; \
		back_index += STACK_PAGE_SIZE; \
	} \
	top = back_page->items + back_index; \
}
210
/* Leave 'top' pointing at the topmost item without popping it.  When
 * the index is 0 the top item is the last slot of the previous page;
 * 'on_error' is executed if there is no previous page, i.e. the stack
 * holds no items. */

#define STACK_TOP(stack, top, on_error) \
if (stack.index == 0) \
{ \
	if (!stack.current->prev) \
		on_error; \
	top = stack.current->prev->items + (STACK_PAGE_SIZE - 1); \
} \
else \
	top = stack.current->items + (stack.index - 1)
223
/* Non-zero when the stack holds no items: the index is 0 and the
 * current page has no predecessor. */

#define STACK_EMPTY(stack) (!((stack.index != 0) || \
			      (stack.current->prev != NULL)))
228
229
/* Initialize a state object: zero the whole structure, start the level
 * counter at 1, and make the embedded first page the current stack
 * page. */

#define NEW_STATE(state) \
memset(&state, 0, sizeof(match_state)); \
state.level = 1; \
state.stack.current = &state.stack.first
236
/* Free every stack page that was malloc'ed (all pages after the
 * embedded first page).
 *
 * The current-page pointer is moved back to the embedded first page
 * before anything is freed, so the state is never left referring to a
 * page that has been released. */

#define FREE_STATE(state) \
for (state.stack.current = &state.stack.first; \
     state.stack.first.next != NULL; ) \
{ \
	item_page_t *freed_page = state.stack.first.next; \
	state.stack.first.next = freed_page->next; \
	free(freed_page); \
}
246
/* Current start of register 'reg'. */

#define GET_REG_START(state, reg) ((state).start[reg])

/* Current end of register 'reg'. */

#define GET_REG_END(state, reg) ((state).end[reg])
254
/* Set the start of register 'xreg' to 'xtext'.
 *
 * If the register has not been saved since the current level began
 * (changed[xreg] < level), its old start, end and 'changed' value are
 * pushed on the stack first and state.count is incremented.
 * 'on_error' is executed if the push cannot get a stack slot.
 *
 * The parameters are named xreg/xtext (like xcode/xtext in
 * PUSH_FAILURE) so that they cannot collide with the 'reg' member of
 * item_t: a parameter called 'reg' would also be substituted inside
 * 'item->reg.num'. */

#define SET_REG_START(state, xreg, xtext, on_error) \
if(state.changed[xreg] < state.level) \
{ \
	item_t *item; \
	STACK_NEXT(state.stack, item, on_error); \
	item->reg.num = xreg; \
	item->reg.start = state.start[xreg]; \
	item->reg.end = state.end[xreg]; \
	item->reg.level = state.changed[xreg]; \
	state.changed[xreg] = state.level; \
	state.count++; \
} \
state.start[xreg] = xtext
271
/* Set the end of register 'xreg' to 'xtext'.
 *
 * If the register has not been saved since the current level began
 * (changed[xreg] < level), its old start, end and 'changed' value are
 * pushed on the stack first and state.count is incremented.
 * 'on_error' is executed if the push cannot get a stack slot.
 *
 * The parameters are named xreg/xtext (like xcode/xtext in
 * PUSH_FAILURE) so that they cannot collide with the 'reg' member of
 * item_t: a parameter called 'reg' would also be substituted inside
 * 'item->reg.num'. */

#define SET_REG_END(state, xreg, xtext, on_error) \
if(state.changed[xreg] < state.level) \
{ \
	item_t *item; \
	STACK_NEXT(state.stack, item, on_error); \
	item->reg.num = xreg; \
	item->reg.start = state.start[xreg]; \
	item->reg.end = state.end[xreg]; \
	item->reg.level = state.changed[xreg]; \
	state.changed[xreg] = state.level; \
	state.count++; \
} \
state.end[xreg] = xtext
288
/* Push a new, non-phantom failure point that resumes at code position
 * 'xcode' and text position 'xtext'.  The current register count and
 * level are stored in the item; afterwards the count restarts at 0 and
 * both the level and the number of failure points go up by one.
 * 'on_error' is executed if no stack slot can be obtained. */

#define PUSH_FAILURE(state, xcode, xtext, on_error) \
{ \
	item_t *item; \
	STACK_NEXT(state.stack, item, on_error); \
	item->fail.code = xcode; \
	item->fail.text = xtext; \
	item->fail.phantom = 0; \
	item->fail.level = state.level; \
	item->fail.count = state.count; \
	state.point++; \
	state.level++; \
	state.count = 0; \
}
302
303/* Update the last failure point with a new position in the text. */
304
305/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
306/* { \ */
307/* item_t *item; \ */
308/* STACK_DISCARD(state.stack, state.count, on_error); \ */
309/* STACK_TOP(state.stack, item, on_error); \ */
310/* item->fail.text = xtext; \ */
311/* state.count = 0; \ */
312/* } */
313
314/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
315/* { \ */
316/* item_t *item; \ */
317/* STACK_BACK(state.stack, item, state.count + 1, on_error); \ */
318/* item->fail.text = xtext; \ */
319/* } */
320
/* Give the most recent failure point the new text position 'xtext'.
 *
 * The failure point lies beneath the state.count registers saved since
 * it was pushed.  If it is already a phantom, those saved registers
 * are discarded and its text position is overwritten in place.  If it
 * is a real failure point it is left alone and a phantom copy (same
 * code position, new text position) is pushed on top instead.
 * 'on_error' is executed on any stack error. */

#define UPDATE_FAILURE(state, xtext, on_error) \
{ \
	item_t *item; \
	STACK_BACK(state.stack, item, state.count + 1, on_error); \
	if (item->fail.phantom) \
	{ \
		STACK_DISCARD(state.stack, state.count, on_error); \
		STACK_TOP(state.stack, item, on_error); \
		item->fail.text = xtext; \
		state.count = 0; \
		state.level++; \
	} \
	else \
	{ \
		item_t *item2; \
		STACK_NEXT(state.stack, item2, on_error); \
		item2->fail.code = item->fail.code; \
		item2->fail.text = xtext; \
		item2->fail.phantom = 1; \
		item2->fail.level = state.level; \
		item2->fail.count = state.count; \
		state.point++; \
		state.level++; \
		state.count = 0; \
	} \
}
347
/* Backtrack to the most recent usable failure point.
 *
 * Each round first pops the state.count registers saved since the
 * failure point and restores their start, end and 'changed' values,
 * then pops the failure point itself into 'xcode'/'xtext' and restores
 * the count and level stored in it.  Rounds repeat while the popped
 * failure point has a NULL text position.  'on_empty' is executed when
 * there is no failure point left to pop, 'on_error' when a saved
 * register is missing. */

#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
{ \
	item_t *item; \
	do \
	{ \
		while (0 < state.count) \
		{ \
			STACK_PREV(state.stack, item, on_error); \
			state.changed[item->reg.num] = item->reg.level; \
			state.end[item->reg.num] = item->reg.end; \
			state.start[item->reg.num] = item->reg.start; \
			state.count--; \
		} \
		STACK_PREV(state.stack, item, on_empty); \
		xcode = item->fail.code; \
		xtext = item->fail.text; \
		state.count = item->fail.count; \
		state.level = item->fail.level; \
		state.point--; \
	} \
	while (!item->fail.text); \
}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000370
/* Opcodes of the compiled regexp.  The numeric values are stored in the
 * compiled pattern, so the order of the enumerators matters. */
enum regexp_compiled_ops
{
	Cend = 0,		/* end of pattern reached */
	Cbol,			/* beginning of line */
	Ceol,			/* end of line */
	Cset,			/* character set; 32 bytes of set follow */
	Cexact,			/* one byte to match follows */
	Canychar,		/* any character except newline */
	Cstart_memory,		/* set register start addr (reg number follows) */
	Cend_memory,		/* set register end addr (reg number follows) */
	Cmatch_memory,		/* match a copy of reg contents (reg number follows) */
	Cjump,			/* two bytes (lsb, msb) of displacement follow */
	Cstar_jump,		/* becomes jump/update_failure_jump at runtime */
	Cfailure_jump,		/* jump to addr on failure */
	Cupdate_failure_jump,	/* update topmost failure point and jump */
	Cdummy_failure_jump,	/* push a dummy failure point and jump */
	Cbegbuf,		/* beginning of buffer */
	Cendbuf,		/* end of buffer */
	Cwordbeg,		/* beginning of word */
	Cwordend,		/* end of word */
	Cwordbound,		/* at a word boundary */
	Cnotwordbound,		/* not at a word boundary */
	Csyntaxspec,		/* syntax code matches (1 byte follows) */
	Cnotsyntaxspec		/* syntax code does not match (1 byte follows) */
};
396
/* Syntax codes that plain and quoted pattern characters are mapped to.
 * Rnum_ops is not a code itself; it is the number of codes and sizes
 * the precedence table. */
enum regexp_syntax_op
{
	Rend = 0,		/* special code for end of regexp */
	Rnormal,		/* normal character */
	Ranychar,		/* any character except newline */
	Rquote,			/* the quote character */
	Rbol,			/* match beginning of line */
	Reol,			/* match end of line */
	Roptional,		/* preceding expression is optional */
	Rstar,			/* preceding expr zero or more times */
	Rplus,			/* preceding expr one or more times */
	Ror,			/* either of the alternatives */
	Ropenpar,		/* opening parenthesis */
	Rclosepar,		/* closing parenthesis */
	Rmemory,		/* match memory register */
	Rextended_memory,	/* \vnn to match registers 10-99 */
	Ropenset,		/* open set; its inner syntax is hard-coded */

	/* The following are GNU extensions to "normal" regexp syntax. */
	Rbegbuf,		/* beginning of buffer */
	Rendbuf,		/* end of buffer */
	Rwordchar,		/* word character */
	Rnotwordchar,		/* not word character */
	Rwordbeg,		/* beginning of word */
	Rwordend,		/* end of word */
	Rwordbound,		/* word bound */
	Rnotwordbound,		/* not word bound */

	Rnum_ops
};
425
426static int re_compile_initialized = 0;
427static int regexp_syntax = 0;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000428int re_syntax = 0; /* Exported copy of regexp_syntax */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000429static unsigned char regexp_plain_ops[256];
430static unsigned char regexp_quoted_ops[256];
431static unsigned char regexp_precedences[Rnum_ops];
432static int regexp_context_indep_ops;
433static int regexp_ansi_sequences;
434
435#define NUM_LEVELS 5 /* number of precedence levels in use */
436#define MAX_NESTING 100 /* max nesting level of operators */
437
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000438#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
439#define Sword 1
440
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000441static char re_syntax_table[256];
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000442
Guido van Rossum004c1e11997-05-09 02:35:58 +0000443static void re_compile_initialize(void)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000444{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000445 int a;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000446
Guido van Rossum004c1e11997-05-09 02:35:58 +0000447 static int syntax_table_inited = 0;
448
449 if (!syntax_table_inited)
450 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000451 syntax_table_inited = 1;
452 memset(re_syntax_table, 0, 256);
453 for (a = 'a'; a <= 'z'; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000454 re_syntax_table[a] = Sword;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000455 for (a = 'A'; a <= 'Z'; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000456 re_syntax_table[a] = Sword;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000457 for (a = '0'; a <= '9'; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000458 re_syntax_table[a] = Sword;
459 }
460 re_compile_initialized = 1;
461 for (a = 0; a < 256; a++)
462 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000463 regexp_plain_ops[a] = Rnormal;
464 regexp_quoted_ops[a] = Rnormal;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000465 }
466 for (a = '0'; a <= '9'; a++)
467 regexp_quoted_ops[a] = Rmemory;
468 regexp_plain_ops['\134'] = Rquote;
469 if (regexp_syntax & RE_NO_BK_PARENS)
470 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000471 regexp_plain_ops['('] = Ropenpar;
472 regexp_plain_ops[')'] = Rclosepar;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000473 }
474 else
475 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000476 regexp_quoted_ops['('] = Ropenpar;
477 regexp_quoted_ops[')'] = Rclosepar;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000478 }
479 if (regexp_syntax & RE_NO_BK_VBAR)
480 regexp_plain_ops['\174'] = Ror;
481 else
482 regexp_quoted_ops['\174'] = Ror;
483 regexp_plain_ops['*'] = Rstar;
484 if (regexp_syntax & RE_BK_PLUS_QM)
485 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000486 regexp_quoted_ops['+'] = Rplus;
487 regexp_quoted_ops['?'] = Roptional;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000488 }
489 else
490 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000491 regexp_plain_ops['+'] = Rplus;
492 regexp_plain_ops['?'] = Roptional;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000493 }
494 if (regexp_syntax & RE_NEWLINE_OR)
495 regexp_plain_ops['\n'] = Ror;
496 regexp_plain_ops['\133'] = Ropenset;
497 regexp_plain_ops['\136'] = Rbol;
498 regexp_plain_ops['$'] = Reol;
499 regexp_plain_ops['.'] = Ranychar;
500 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
501 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000502 regexp_quoted_ops['w'] = Rwordchar;
503 regexp_quoted_ops['W'] = Rnotwordchar;
504 regexp_quoted_ops['<'] = Rwordbeg;
505 regexp_quoted_ops['>'] = Rwordend;
506 regexp_quoted_ops['b'] = Rwordbound;
507 regexp_quoted_ops['B'] = Rnotwordbound;
508 regexp_quoted_ops['`'] = Rbegbuf;
509 regexp_quoted_ops['\''] = Rendbuf;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000510 }
511 if (regexp_syntax & RE_ANSI_HEX)
512 regexp_quoted_ops['v'] = Rextended_memory;
513 for (a = 0; a < Rnum_ops; a++)
514 regexp_precedences[a] = 4;
515 if (regexp_syntax & RE_TIGHT_VBAR)
516 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000517 regexp_precedences[Ror] = 3;
518 regexp_precedences[Rbol] = 2;
519 regexp_precedences[Reol] = 2;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000520 }
521 else
522 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000523 regexp_precedences[Ror] = 2;
524 regexp_precedences[Rbol] = 3;
525 regexp_precedences[Reol] = 3;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000526 }
527 regexp_precedences[Rclosepar] = 1;
528 regexp_precedences[Rend] = 0;
529 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
530 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000531}
532
Guido van Rossum004c1e11997-05-09 02:35:58 +0000533int re_set_syntax(int syntax)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000534{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000535 int ret;
536
537 ret = regexp_syntax;
538 regexp_syntax = syntax;
539 re_syntax = syntax; /* Exported copy */
540 re_compile_initialize();
541 return ret;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000542}
543
/* Return the value (0-15) of the hexadecimal digit 'ch', accepting
 * both upper and lower case letters.  Returns 16 if 'ch' is not a
 * hexadecimal digit. */
static int hex_char_to_decimal(int ch)
{
	int value = 16;

	if ('0' <= ch && ch <= '9')
		value = ch - '0';
	else if ('a' <= ch && ch <= 'f')
		value = 10 + (ch - 'a');
	else if ('A' <= ch && ch <= 'F')
		value = 10 + (ch - 'A');
	return value;
}
554
Guido van Rossum004c1e11997-05-09 02:35:58 +0000555static void re_compile_fastmap_aux(char *code,
556 int pos,
557 char *visited,
558 char *can_be_null,
559 char *fastmap)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000560{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000561 int a;
562 int b;
563 int syntaxcode;
564
565 if (visited[pos])
566 return; /* we have already been here */
567 visited[pos] = 1;
568 for (;;)
569 switch (code[pos++])
570 {
571 case Cend:
572 {
573 *can_be_null = 1;
574 return;
575 }
576 case Cbol:
577 case Cbegbuf:
578 case Cendbuf:
579 case Cwordbeg:
580 case Cwordend:
581 case Cwordbound:
582 case Cnotwordbound:
583 {
584 break;
585 }
586 case Csyntaxspec:
587 {
588 syntaxcode = code[pos++];
589 for (a = 0; a < 256; a++)
590 if (SYNTAX(a) == syntaxcode)
591 fastmap[a] = 1;
592 return;
593 }
594 case Cnotsyntaxspec:
595 {
596 syntaxcode = code[pos++];
597 for (a = 0; a < 256; a++)
598 if (SYNTAX(a) != syntaxcode)
599 fastmap[a] = 1;
600 return;
601 }
602 case Ceol:
603 {
604 fastmap['\n'] = 1;
605 if (*can_be_null == 0)
606 *can_be_null = 2; /* can match null, but only at end of buffer*/
607 return;
608 }
609 case Cset:
610 {
611 for (a = 0; a < 256/8; a++)
612 if (code[pos + a] != 0)
613 for (b = 0; b < 8; b++)
614 if (code[pos + a] & (1 << b))
615 fastmap[(a << 3) + b] = 1;
616 pos += 256/8;
617 return;
618 }
619 case Cexact:
620 {
621 fastmap[(unsigned char)code[pos]] = 1;
622 return;
623 }
624 case Canychar:
625 {
626 for (a = 0; a < 256; a++)
627 if (a != '\n')
628 fastmap[a] = 1;
629 return;
630 }
631 case Cstart_memory:
632 case Cend_memory:
633 {
634 pos++;
635 break;
636 }
637 case Cmatch_memory:
638 {
639 for (a = 0; a < 256; a++)
640 fastmap[a] = 1;
641 *can_be_null = 1;
642 return;
643 }
644 case Cjump:
645 case Cdummy_failure_jump:
646 case Cupdate_failure_jump:
647 case Cstar_jump:
648 {
649 a = (unsigned char)code[pos++];
650 a |= (unsigned char)code[pos++] << 8;
651 pos += (int)(short)a;
652 if (visited[pos])
653 {
654 /* argh... the regexp contains empty loops. This is not
655 good, as this may cause a failure stack overflow when
656 matching. Oh well. */
657 /* this path leads nowhere; pursue other paths. */
658 return;
659 }
660 visited[pos] = 1;
661 break;
662 }
663 case Cfailure_jump:
664 {
665 a = (unsigned char)code[pos++];
666 a |= (unsigned char)code[pos++] << 8;
667 a = pos + (int)(short)a;
668 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
669 break;
670 }
671 default:
672 {
673 abort(); /* probably some opcode is missing from this switch */
674 /*NOTREACHED*/
675 }
676 }
677}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000678
/* Compute the fastmap and the can-be-null indicator for the code in
 * 'buffer' ('used' bytes long), starting at offset 'pos'.
 *
 * The 'visited' scratch array needs one byte per code byte; it lives
 * on the stack when 512 bytes are enough and is malloc'ed otherwise.
 * Returns 1 on success and 0 if that allocation fails, in which case
 * neither '*can_be_null' nor 'fastmap' has been touched. */
static int re_do_compile_fastmap(char *buffer,
				 int used,
				 int pos,
				 char *can_be_null,
				 char *fastmap)
{
	char small_visited[512];
	char *visited = small_visited;

	if (used > sizeof(small_visited))
	{
		visited = malloc(used);
		if (!visited)
			return 0;
	}

	*can_be_null = 0;
	memset(fastmap, 0, 256);
	memset(visited, 0, used);
	re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);

	if (visited != small_visited)
		free(visited);
	return 1;
}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000703
Guido van Rossum004c1e11997-05-09 02:35:58 +0000704void re_compile_fastmap(regexp_t bufp)
705{
706 if (!bufp->fastmap || bufp->fastmap_accurate)
707 return;
708 assert(bufp->used > 0);
709 if (!re_do_compile_fastmap(bufp->buffer,
710 bufp->used,
711 0,
712 &bufp->can_be_null,
713 bufp->fastmap))
714 return;
715 if (bufp->buffer[0] == Cbol)
716 bufp->anchor = 1; /* begline */
717 else
718 if (bufp->buffer[0] == Cbegbuf)
719 bufp->anchor = 2; /* begbuf */
720 else
721 bufp->anchor = 0; /* none */
722 bufp->fastmap_accurate = 1;
723}
724
725/*
726 * star is coded as:
727 * 1: failure_jump 2
728 * ... code for operand of star
729 * star_jump 1
730 * 2: ... code after star
731 *
732 * We change the star_jump to update_failure_jump if we can determine
733 * that it is safe to do so; otherwise we change it to an ordinary
734 * jump.
735 *
736 * plus is coded as
737 *
738 * jump 2
739 * 1: failure_jump 3
740 * 2: ... code for operand of plus
741 * star_jump 1
742 * 3: ... code after plus
743 *
744 * For star_jump considerations this is processed identically to star.
745 *
746 */
747
748static int re_optimize_star_jump(regexp_t bufp, char *code)
749{
750 char map[256];
751 char can_be_null;
752 char *p1;
753 char *p2;
754 char ch;
755 int a;
756 int b;
757
758 a = (unsigned char)*code++;
759 a |= (unsigned char)*code++ << 8;
760 a = (int)(short)a;
761
762 p1 = code + a + 3; /* skip the failure_jump */
763 assert(p1[-3] == Cfailure_jump);
764 p2 = code;
765 /* p1 points inside loop, p2 points to after loop */
766 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
767 p2 - bufp->buffer, &can_be_null, map))
768 goto make_normal_jump;
769
770 /* If we might introduce a new update point inside the
771 * loop, we can't optimize because then update_jump would
772 * update a wrong failure point. Thus we have to be
773 * quite careful here.
774 */
775
776 /* loop until we find something that consumes a character */
777 loop_p1:
778 switch (*p1++)
779 {
780 case Cbol:
781 case Ceol:
782 case Cbegbuf:
783 case Cendbuf:
784 case Cwordbeg:
785 case Cwordend:
786 case Cwordbound:
787 case Cnotwordbound:
788 {
789 goto loop_p1;
790 }
791 case Cstart_memory:
792 case Cend_memory:
793 {
794 p1++;
795 goto loop_p1;
796 }
797 case Cexact:
798 {
799 ch = (unsigned char)*p1++;
800 if (map[ch])
801 goto make_normal_jump;
802 break;
803 }
804 case Canychar:
805 {
806 for (b = 0; b < 256; b++)
807 if (b != '\n' && map[b])
808 goto make_normal_jump;
809 break;
810 }
811 case Cset:
812 {
813 for (b = 0; b < 256; b++)
814 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
815 goto make_normal_jump;
816 p1 += 256/8;
817 break;
818 }
819 default:
820 {
821 goto make_normal_jump;
822 }
823 }
824 /* now we know that we can't backtrack. */
825 while (p1 != p2 - 3)
826 {
827 switch (*p1++)
828 {
829 case Cend:
830 {
831 return 0;
832 }
833 case Cbol:
834 case Ceol:
835 case Canychar:
836 case Cbegbuf:
837 case Cendbuf:
838 case Cwordbeg:
839 case Cwordend:
840 case Cwordbound:
841 case Cnotwordbound:
842 {
843 break;
844 }
845 case Cset:
846 {
847 p1 += 256/8;
848 break;
849 }
850 case Cexact:
851 case Cstart_memory:
852 case Cend_memory:
853 case Cmatch_memory:
854 case Csyntaxspec:
855 case Cnotsyntaxspec:
856 {
857 p1++;
858 break;
859 }
860 case Cjump:
861 case Cstar_jump:
862 case Cfailure_jump:
863 case Cupdate_failure_jump:
864 case Cdummy_failure_jump:
865 {
866 goto make_normal_jump;
867 }
868 default:
869 {
870 return 0;
871 break;
872 }
873 }
874 }
875
876 make_update_jump:
877 code -= 3;
878 a += 3; /* jump to after the Cfailure_jump */
879 code[0] = Cupdate_failure_jump;
880 code[1] = a & 0xff;
881 code[2] = a >> 8;
882 return 1;
883
884 make_normal_jump:
885 code -= 3;
886 *code = Cjump;
887 return 1;
888}
889
890static int re_optimize(regexp_t bufp)
891{
892 char *code;
893
894 code = bufp->buffer;
895
896 while(1)
897 {
898 switch (*code++)
899 {
900 case Cend:
901 {
902 return 1;
903 }
904 case Canychar:
905 case Cbol:
906 case Ceol:
907 case Cbegbuf:
908 case Cendbuf:
909 case Cwordbeg:
910 case Cwordend:
911 case Cwordbound:
912 case Cnotwordbound:
913 {
914 break;
915 }
916 case Cset:
917 {
918 code += 256/8;
919 break;
920 }
921 case Cexact:
922 case Cstart_memory:
923 case Cend_memory:
924 case Cmatch_memory:
925 case Csyntaxspec:
926 case Cnotsyntaxspec:
927 {
928 code++;
929 break;
930 }
931 case Cstar_jump:
932 {
933 if (!re_optimize_star_jump(bufp, code))
934 {
935 return 0;
936 }
937 /* fall through */
938 }
939 case Cupdate_failure_jump:
940 case Cjump:
941 case Cdummy_failure_jump:
942 case Cfailure_jump:
943 {
944 code += 2;
945 break;
946 }
947 default:
948 {
949 return 0;
950 }
951 }
952 }
953}
954
/* Fetch the next pattern character into 'var' and advance 'pos';
 * jumps to the ends_prematurely label if the pattern is exhausted. */
#define NEXTCHAR(var) \
{ \
	if (!(pos < size)) \
		goto ends_prematurely; \
	(var) = regex[pos]; \
	pos += 1; \
}
962
/* Make sure at least 'amount' more bytes fit into 'pattern', growing
 * the buffer by 'amount' + 256 bytes when they do not.
 *
 * The result of realloc is checked before 'pattern' and 'alloc' are
 * updated: on failure the old buffer is still owned by 'pattern' (so
 * it is neither leaked nor replaced by NULL), 'alloc' still describes
 * it, and control jumps to the out_of_memory label. */
#define ALLOC(amount) \
{ \
	if (pattern_offset+(amount) > alloc) \
	{ \
		int alloc_wanted = alloc + 256 + (amount); \
		char *alloc_grown = realloc(pattern, alloc_wanted); \
		if (!alloc_grown) \
			goto out_of_memory; \
		pattern = alloc_grown; \
		alloc = alloc_wanted; \
	} \
}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000973
/* Append one byte to the compiled pattern. */
#define STORE(ch) (pattern[pattern_offset++] = (ch))

/* Pattern offset at which the current precedence level started. */
#define CURRENT_LEVEL_START (starts[starts_base + current_level])

/* Record the current pattern offset as the start of the current level. */
#define SET_LEVEL_START (starts[starts_base + current_level] = pattern_offset)
979
/* Open a fresh group of NUM_LEVELS level starts; jumps to the
 * too_complex label when MAX_NESTING groups are already in use. */
#define PUSH_LEVEL_STARTS \
	if (starts_base >= (MAX_NESTING-1)*NUM_LEVELS) \
		goto too_complex; \
	else \
		starts_base += NUM_LEVELS

/* Return to the previous group of level starts. */
#define POP_LEVEL_STARTS (starts_base -= NUM_LEVELS)
987
/* Store a jump displacement at pattern[offset] and pattern[offset+1],
 * low byte first.  The displacement is the distance from the byte
 * after those two bytes to 'addr'. */
#define PUT_ADDR(offset,addr) \
{ \
	int put_addr_disp = (addr) - ((offset) + 2); \
	pattern[(offset)+1] = (put_addr_disp >> 8) & 0xff; \
	pattern[(offset)] = put_addr_disp & 0xff; \
}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000994
/* Insert a 3-byte jump instruction of opcode 'type', targeting pattern
 * offset 'addr', at pattern offset 'pos'.  Everything from 'pos' up to
 * the current end of the pattern is moved 3 bytes up to make room and
 * pattern_offset grows by 3.  The caller must have reserved the space.
 *
 * The locals carry an ij_ prefix and the arguments are evaluated
 * before the loop index is declared, so an argument that mentions a
 * caller variable such as 'a' reads the caller's variable and not an
 * uninitialized macro-local of the same name. */
#define INSERT_JUMP(pos,type,addr) \
{ \
	int ij_pos = (pos), ij_type = (type), ij_addr = (addr); \
	int ij_i; \
	for (ij_i = pattern_offset - 1; ij_i >= ij_pos; ij_i--) \
		pattern[ij_i + 3] = pattern[ij_i]; \
	pattern[ij_pos] = ij_type; \
	PUT_ADDR(ij_pos+1,ij_addr); \
	pattern_offset += 3; \
}
/* Set bit number 'bit' in the bitmap that starts at buf[offset]. */
#define SETBIT(buf,offset,bit) ((buf)[(offset)+(bit)/8] |= (1<<((bit) & 7)))
1005
/* Copy the compiler's working buffer pointer, its allocated size and
 * the number of bytes used back into the pattern structure 'bufp'. */
#define SET_FIELDS \
{ \
	bufp->buffer = pattern; \
	bufp->used = pattern_offset; \
	bufp->allocated = alloc; \
}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001012
/* Read two hexadecimal digits from the pattern and store the byte they
 * spell (first digit is the high nibble) in 'var'.  Jumps to the
 * hex_error label if either character is not a hexadecimal digit; a
 * pattern that ends too early is handled by NEXTCHAR. */
#define GETHEX(var) \
{ \
	char gethex_digit, gethex_high, gethex_low; \
	NEXTCHAR(gethex_digit); \
	gethex_high = hex_char_to_decimal(gethex_digit); \
	if (gethex_high == 16) \
		goto hex_error; \
	NEXTCHAR(gethex_digit); \
	gethex_low = hex_char_to_decimal(gethex_digit); \
	if (gethex_low == 16) \
		goto hex_error; \
	(var) = gethex_high * 16 + gethex_low; \
}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001026
/* Replace the character that followed the quote character by what it
 * stands for.  The letters a, b, f, n, r, t and v (either case) become
 * the corresponding control character, x/X reads a two-digit hex code
 * via GETHEX, and any other character is passed through the translate
 * table when one is set. */
#define ANSI_TRANSLATE(ch) \
{ \
	switch (ch) \
	{ \
	case 'a': case 'A': \
		ch = 7; /* audible bell */ \
		break; \
	case 'b': case 'B': \
		ch = 8; /* backspace */ \
		break; \
	case 'f': case 'F': \
		ch = 12; /* form feed */ \
		break; \
	case 'n': case 'N': \
		ch = 10; /* line feed */ \
		break; \
	case 'r': case 'R': \
		ch = 13; /* carriage return */ \
		break; \
	case 't': case 'T': \
		ch = 9; /* tab */ \
		break; \
	case 'v': case 'V': \
		ch = 11; /* vertical tab */ \
		break; \
	case 'x': case 'X': /* hex code */ \
		GETHEX(ch); \
		break; \
	default: \
		/* other characters passed through */ \
		if (translate) \
			ch = translate[(unsigned char)ch]; \
		break; \
	} \
}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001088
Guido van Rossum004c1e11997-05-09 02:35:58 +00001089char *re_compile_pattern(char *regex, int size, regexp_t bufp)
1090{
1091 int a;
1092 int pos;
1093 int op;
1094 int current_level;
1095 int level;
1096 int opcode;
1097 int pattern_offset, alloc;
1098 int starts[NUM_LEVELS * MAX_NESTING];
1099 int starts_base;
1100 int future_jumps[MAX_NESTING];
1101 int num_jumps;
1102 unsigned char ch;
1103 char *pattern;
1104 char *translate;
1105 int next_register;
1106 int paren_depth;
1107 int num_open_registers;
1108 int open_registers[RE_NREGS];
1109 int beginning_context;
1110
1111 if (!re_compile_initialized)
1112 re_compile_initialize();
1113 bufp->used = 0;
1114 bufp->fastmap_accurate = 0;
1115 bufp->uses_registers = 0;
1116 translate = bufp->translate;
1117 pattern = bufp->buffer;
1118 alloc = bufp->allocated;
1119 if (alloc == 0 || pattern == NULL)
1120 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001121 alloc = 256;
1122 pattern = malloc(alloc);
1123 if (!pattern)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001124 goto out_of_memory;
1125 }
1126 pattern_offset = 0;
1127 starts_base = 0;
1128 num_jumps = 0;
1129 current_level = 0;
1130 SET_LEVEL_START;
1131 num_open_registers = 0;
1132 next_register = 1;
1133 paren_depth = 0;
1134 beginning_context = 1;
1135 op = -1;
1136 /* we use Rend dummy to ensure that pending jumps are updated (due to
1137 low priority of Rend) before exiting the loop. */
1138 pos = 0;
1139 while (op != Rend)
1140 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001141 if (pos >= size)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001142 op = Rend;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001143 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001144 {
1145 NEXTCHAR(ch);
1146 if (translate)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001147 ch = translate[(unsigned char)ch];
Guido van Rossum004c1e11997-05-09 02:35:58 +00001148 op = regexp_plain_ops[(unsigned char)ch];
1149 if (op == Rquote)
1150 {
1151 NEXTCHAR(ch);
1152 op = regexp_quoted_ops[(unsigned char)ch];
1153 if (op == Rnormal && regexp_ansi_sequences)
1154 ANSI_TRANSLATE(ch);
1155 }
1156 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001157 level = regexp_precedences[op];
1158 /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
Guido van Rossum004c1e11997-05-09 02:35:58 +00001159 ch, op, level, current_level, CURRENT_LEVEL_START); */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001160 if (level > current_level)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001161 {
1162 for (current_level++; current_level < level; current_level++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001163 SET_LEVEL_START;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001164 SET_LEVEL_START;
1165 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001166 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001167 if (level < current_level)
1168 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001169 current_level = level;
1170 for (;num_jumps > 0 &&
Guido van Rossum004c1e11997-05-09 02:35:58 +00001171 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001172 num_jumps--)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001173 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
1174 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001175 switch (op)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001176 {
1177 case Rend:
1178 {
1179 break;
1180 }
1181 case Rnormal:
1182 {
1183 normal_char:
1184 opcode = Cexact;
1185 store_opcode_and_arg: /* opcode & ch must be set */
1186 SET_LEVEL_START;
1187 ALLOC(2);
1188 STORE(opcode);
1189 STORE(ch);
1190 break;
1191 }
1192 case Ranychar:
1193 {
1194 opcode = Canychar;
1195 store_opcode:
1196 SET_LEVEL_START;
1197 ALLOC(1);
1198 STORE(opcode);
1199 break;
1200 }
1201 case Rquote:
1202 {
1203 abort();
1204 /*NOTREACHED*/
1205 }
1206 case Rbol:
1207 {
1208 if (!beginning_context)
1209 if (regexp_context_indep_ops)
1210 goto op_error;
1211 else
1212 goto normal_char;
1213 opcode = Cbol;
1214 goto store_opcode;
1215 }
1216 case Reol:
1217 {
1218 if (!((pos >= size) ||
1219 ((regexp_syntax & RE_NO_BK_VBAR) ?
1220 (regex[pos] == '\174') :
1221 (pos+1 < size && regex[pos] == '\134' &&
1222 regex[pos+1] == '\174')) ||
1223 ((regexp_syntax & RE_NO_BK_PARENS)?
1224 (regex[pos] == ')'):
1225 (pos+1 < size && regex[pos] == '\134' &&
1226 regex[pos+1] == ')'))))
1227 if (regexp_context_indep_ops)
1228 goto op_error;
1229 else
1230 goto normal_char;
1231 opcode = Ceol;
1232 goto store_opcode;
1233 /* NOTREACHED */
1234 break;
1235 }
1236 case Roptional:
1237 {
1238 if (beginning_context)
1239 if (regexp_context_indep_ops)
1240 goto op_error;
1241 else
1242 goto normal_char;
1243 if (CURRENT_LEVEL_START == pattern_offset)
1244 break; /* ignore empty patterns for ? */
1245 ALLOC(3);
1246 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1247 pattern_offset + 3);
1248 break;
1249 }
1250 case Rstar:
1251 case Rplus:
1252 {
1253 if (beginning_context)
1254 if (regexp_context_indep_ops)
1255 goto op_error;
1256 else
1257 goto normal_char;
1258 if (CURRENT_LEVEL_START == pattern_offset)
1259 break; /* ignore empty patterns for + and * */
1260 ALLOC(9);
1261 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1262 pattern_offset + 6);
1263 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1264 if (op == Rplus) /* jump over initial failure_jump */
1265 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1266 CURRENT_LEVEL_START + 6);
1267 break;
1268 }
1269 case Ror:
1270 {
1271 ALLOC(6);
1272 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1273 pattern_offset + 6);
1274 if (num_jumps >= MAX_NESTING)
1275 goto too_complex;
1276 STORE(Cjump);
1277 future_jumps[num_jumps++] = pattern_offset;
1278 STORE(0);
1279 STORE(0);
1280 SET_LEVEL_START;
1281 break;
1282 }
1283 case Ropenpar:
1284 {
1285 SET_LEVEL_START;
1286 if (next_register < RE_NREGS)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001287 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001288 bufp->uses_registers = 1;
1289 ALLOC(2);
1290 STORE(Cstart_memory);
1291 STORE(next_register);
1292 open_registers[num_open_registers++] = next_register;
1293 next_register++;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001294 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001295 paren_depth++;
1296 PUSH_LEVEL_STARTS;
1297 current_level = 0;
1298 SET_LEVEL_START;
1299 break;
1300 }
1301 case Rclosepar:
1302 {
1303 if (paren_depth <= 0)
1304 goto parenthesis_error;
1305 POP_LEVEL_STARTS;
1306 current_level = regexp_precedences[Ropenpar];
1307 paren_depth--;
1308 if (paren_depth < num_open_registers)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001309 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001310 bufp->uses_registers = 1;
1311 ALLOC(2);
1312 STORE(Cend_memory);
1313 num_open_registers--;
1314 STORE(open_registers[num_open_registers]);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001315 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001316 break;
1317 }
1318 case Rmemory:
1319 {
1320 if (ch == '0')
1321 goto bad_match_register;
1322 assert(ch >= '0' && ch <= '9');
1323 bufp->uses_registers = 1;
1324 opcode = Cmatch_memory;
1325 ch -= '0';
1326 goto store_opcode_and_arg;
1327 }
1328 case Rextended_memory:
1329 {
1330 NEXTCHAR(ch);
1331 if (ch < '0' || ch > '9')
1332 goto bad_match_register;
1333 NEXTCHAR(a);
1334 if (a < '0' || a > '9')
1335 goto bad_match_register;
1336 ch = 10 * (a - '0') + ch - '0';
1337 if (ch <= 0 || ch >= RE_NREGS)
1338 goto bad_match_register;
1339 bufp->uses_registers = 1;
1340 opcode = Cmatch_memory;
1341 goto store_opcode_and_arg;
1342 }
1343 case Ropenset:
1344 {
1345 int complement;
1346 int prev;
1347 int offset;
1348 int range;
1349 int firstchar;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001350
1351 SET_LEVEL_START;
1352 ALLOC(1+256/8);
1353 STORE(Cset);
1354 offset = pattern_offset;
1355 for (a = 0; a < 256/8; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001356 STORE(0);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001357 NEXTCHAR(ch);
1358 if (translate)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001359 ch = translate[(unsigned char)ch];
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001360 if (ch == '\136')
Guido van Rossum004c1e11997-05-09 02:35:58 +00001361 {
1362 complement = 1;
1363 NEXTCHAR(ch);
1364 if (translate)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001365 ch = translate[(unsigned char)ch];
Guido van Rossum004c1e11997-05-09 02:35:58 +00001366 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001367 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001368 complement = 0;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001369 prev = -1;
1370 range = 0;
1371 firstchar = 1;
1372 while (ch != '\135' || firstchar)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001373 {
1374 firstchar = 0;
1375 if (regexp_ansi_sequences && ch == '\134')
1376 {
1377 NEXTCHAR(ch);
1378 ANSI_TRANSLATE(ch);
1379 }
1380 if (range)
1381 {
1382 for (a = prev; a <= (int)ch; a++)
1383 SETBIT(pattern, offset, a);
1384 prev = -1;
1385 range = 0;
1386 }
1387 else
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001388 if (prev != -1 && ch == '-')
Guido van Rossum004c1e11997-05-09 02:35:58 +00001389 range = 1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001390 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001391 {
1392 SETBIT(pattern, offset, ch);
1393 prev = ch;
1394 }
1395 NEXTCHAR(ch);
1396 if (translate)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001397 ch = translate[(unsigned char)ch];
Guido van Rossum004c1e11997-05-09 02:35:58 +00001398 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001399 if (range)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001400 SETBIT(pattern, offset, '-');
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001401 if (complement)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001402 {
1403 for (a = 0; a < 256/8; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001404 pattern[offset+a] ^= 0xff;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001405 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001406 break;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001407 }
1408 case Rbegbuf:
1409 {
1410 opcode = Cbegbuf;
1411 goto store_opcode;
1412 }
1413 case Rendbuf:
1414 {
1415 opcode = Cendbuf;
1416 goto store_opcode;
1417 }
1418 case Rwordchar:
1419 {
1420 opcode = Csyntaxspec;
1421 ch = Sword;
1422 goto store_opcode_and_arg;
1423 }
1424 case Rnotwordchar:
1425 {
1426 opcode = Cnotsyntaxspec;
1427 ch = Sword;
1428 goto store_opcode_and_arg;
1429 }
1430 case Rwordbeg:
1431 {
1432 opcode = Cwordbeg;
1433 goto store_opcode;
1434 }
1435 case Rwordend:
1436 {
1437 opcode = Cwordend;
1438 goto store_opcode;
1439 }
1440 case Rwordbound:
1441 {
1442 opcode = Cwordbound;
1443 goto store_opcode;
1444 }
1445 case Rnotwordbound:
1446 {
1447 opcode = Cnotwordbound;
1448 goto store_opcode;
1449 }
1450 default:
1451 {
1452 abort();
1453 }
1454 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001455 beginning_context = (op == Ropenpar || op == Ror);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001456 }
1457 if (starts_base != 0)
1458 goto parenthesis_error;
1459 assert(num_jumps == 0);
1460 ALLOC(1);
1461 STORE(Cend);
1462 SET_FIELDS;
1463 if(!re_optimize(bufp))
1464 return "Optimization error";
1465 return NULL;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001466
Guido van Rossum004c1e11997-05-09 02:35:58 +00001467 op_error:
1468 SET_FIELDS;
1469 return "Badly placed special character";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001470
Guido van Rossum004c1e11997-05-09 02:35:58 +00001471 bad_match_register:
1472 SET_FIELDS;
1473 return "Bad match register number";
1474
1475 hex_error:
1476 SET_FIELDS;
1477 return "Bad hexadecimal number";
1478
1479 parenthesis_error:
1480 SET_FIELDS;
1481 return "Badly placed parenthesis";
1482
1483 out_of_memory:
1484 SET_FIELDS;
1485 return "Out of memory";
1486
1487 ends_prematurely:
1488 SET_FIELDS;
1489 return "Regular expression ends prematurely";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001490
Guido van Rossum004c1e11997-05-09 02:35:58 +00001491 too_complex:
1492 SET_FIELDS;
1493 return "Regular expression too complex";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001494}
Guido van Rossum004c1e11997-05-09 02:35:58 +00001495
/* The helper macros of re_compile_pattern are not needed past this
 * point; some names (NEXTCHAR) are redefined for the matcher below. */
#undef CHARAT
#undef NEXTCHAR
#undef GETHEX
#undef ALLOC
#undef STORE
#undef CURRENT_LEVEL_START
#undef SET_LEVEL_START
#undef PUSH_LEVEL_STARTS
#undef POP_LEVEL_STARTS
#undef PUT_ADDR
#undef INSERT_JUMP
#undef SETBIT
#undef SET_FIELDS

/* Jump to the `fail' label of the enclosing function when the subject
 * text is exhausted.  Uses the variables `text' and `textend' of the
 * expansion site. */
#define PREFETCH \
do { \
	if (text == textend) \
		goto fail; \
} while (0)

/* Fetch the next subject character into var and advance `text'; jumps
 * to `fail' at end of text.  When `translate' is non-NULL the character
 * is mapped through it.  Both the raw and the translated byte are
 * converted via unsigned char, so var always ends up in 0..255 and is
 * safe to use as a table or bitmap index even where plain char is
 * signed. */
#define NEXTCHAR(var) \
do { \
	PREFETCH; \
	(var) = (unsigned char)*text++; \
	if (translate) \
		(var) = (unsigned char)translate[(var)]; \
} while (0)

1518int re_match(regexp_t bufp,
1519 char *string,
1520 int size,
1521 int pos,
1522 regexp_registers_t old_regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001523{
Guido van Rossum004c1e11997-05-09 02:35:58 +00001524 char *code;
1525 char *translate;
1526 char *text;
1527 char *textstart;
1528 char *textend;
1529 int a;
1530 int b;
1531 int ch;
1532 int reg;
1533 int match_end;
1534 char *regstart;
1535 char *regend;
1536 int regsize;
1537 match_state state;
1538
1539 assert(pos >= 0 && size >= 0);
1540 assert(pos <= size);
1541
1542 text = string + pos;
1543 textstart = string;
1544 textend = string + size;
1545
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001546 code = bufp->buffer;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001547
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001548 translate = bufp->translate;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001549/* translated = NULL; */
1550/* if (bufp->translate) */
1551/* { */
1552/* char *t1; */
1553/* char *t2; */
1554
1555/* translated = malloc(size); */
1556/* if (translated == NULL) */
1557/* goto error; */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001558
Guido van Rossum004c1e11997-05-09 02:35:58 +00001559/* t1 = string; */
1560/* t2 = translated; */
1561/* while(t1 < textend) */
1562/* *t2++ = bufp->translate[*t1++]; */
1563
1564/* text = translated + pos; */
1565/* textstart = translated; */
1566/* textend = translated + size; */
1567/* } */
1568
1569 NEW_STATE(state);
1570
1571 continue_matching:
1572 switch (*code++)
1573 {
1574 case Cend:
1575 {
1576 match_end = text - textstart;
1577 if (old_regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001578 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001579 old_regs->start[0] = pos;
1580 old_regs->end[0] = match_end;
1581 if (!bufp->uses_registers)
1582 {
1583 for (a = 1; a < RE_NREGS; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001584 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001585 old_regs->start[a] = -1;
1586 old_regs->end[a] = -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001587 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001588 }
1589 else
1590 {
1591 for (a = 1; a < RE_NREGS; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001592 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001593 if ((GET_REG_START(state, a) == NULL) ||
1594 (GET_REG_END(state, a) == NULL))
1595 {
1596 old_regs->start[a] = -1;
1597 old_regs->end[a] = -1;
1598 continue;
1599 }
1600 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1601 old_regs->end[a] = GET_REG_END(state, a) - textstart;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001602 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001603 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001604 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001605/* if(translated) */
1606/* free(translated); */
1607 FREE_STATE(state);
1608 return match_end - pos;
1609 }
1610 case Cbol:
1611 {
1612 if (text == textstart || text[-1] == '\n')
1613 goto continue_matching;
1614 goto fail;
1615 }
1616 case Ceol:
1617 {
1618 if (text == textend || *text == '\n')
1619 goto continue_matching;
1620 goto fail;
1621 }
1622 case Cset:
1623 {
1624 NEXTCHAR(ch);
1625 if (code[ch/8] & (1<<(ch & 7)))
1626 {
1627 code += 256/8;
1628 goto continue_matching;
1629 }
1630 goto fail;
1631 }
1632 case Cexact:
1633 {
1634 NEXTCHAR(ch);
1635 if (ch != (unsigned char)*code++)
1636 goto fail;
1637/* { */
1638/* char *p1 = code - 2; */
1639/* ch = *(code - 1); */
1640/* POP_FAILURE(state, code, text, goto done_matching, goto error); */
1641/* while ((code == p1) && (*text != ch)) */
1642/* POP_FAILURE(state, code, text, goto done_matching, goto error); */
1643/* if ((code == p1) && (*text == ch)) */
1644/* { */
1645/* code += 2; */
1646/* text++; */
1647/* } */
1648/* } */
1649 goto continue_matching;
1650 }
1651 case Canychar:
1652 {
1653 NEXTCHAR(ch);
1654 if (ch == '\n')
1655 goto fail;
1656 goto continue_matching;
1657 }
1658 case Cstart_memory:
1659 {
1660 reg = *code++;
1661 SET_REG_START(state, reg, text, goto error);
1662 goto continue_matching;
1663 }
1664 case Cend_memory:
1665 {
1666 reg = *code++;
1667 SET_REG_END(state, reg, text, goto error);
1668 goto continue_matching;
1669 }
1670 case Cmatch_memory:
1671 {
1672 reg = *code++;
1673 regstart = GET_REG_START(state, reg);
1674 regend = GET_REG_END(state, reg);
1675 if ((regstart == NULL) || (regend == NULL))
1676 goto fail; /* or should we just match nothing? */
1677 regsize = regend - regstart;
1678
1679 if (regsize > (textend - text))
1680 goto fail;
1681 if(translate)
1682 {
1683 for (; regstart < regend; regstart++, text++)
1684 if (translate[*regstart] != translate[*text])
1685 goto fail;
1686 }
1687 else
1688 for (; regstart < regend; regstart++, text++)
1689 if (*regstart != *text)
1690 goto fail;
1691/* if (memcmp(text, regstart, regsize) != 0)
1692 goto fail;
1693 text += regsize; */
1694 goto continue_matching;
1695 }
1696 case Cupdate_failure_jump:
1697 {
1698 UPDATE_FAILURE(state, text, goto error);
1699 /* fall to next case */
1700 }
1701 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1702 case Cstar_jump:
1703 case Cjump:
1704 {
1705 a = (unsigned char)*code++;
1706 a |= (unsigned char)*code++ << 8;
1707 code += (int)(short)a;
1708 goto continue_matching;
1709 }
1710 case Cdummy_failure_jump:
1711 {
1712 a = (unsigned char)*code++;
1713 a |= (unsigned char)*code++ << 8;
1714 a = (int)(short)a;
1715 assert(*code == Cfailure_jump);
1716 b = (unsigned char)code[1];
1717 b |= (unsigned char)code[2] << 8;
1718 PUSH_FAILURE(state, code + (int)(short)b + 3, NULL, goto error);
1719 code += a;
1720 goto continue_matching;
1721 }
1722 case Cfailure_jump:
1723 {
1724 a = (unsigned char)*code++;
1725 a |= (unsigned char)*code++ << 8;
1726 a = (int)(short)a;
1727 PUSH_FAILURE(state, code + a, text, goto error);
1728 goto continue_matching;
1729 }
1730 case Cbegbuf:
1731 {
1732 if (text == textstart)
1733 goto continue_matching;
1734 goto fail;
1735 }
1736 case Cendbuf:
1737 {
1738 if (text == textend)
1739 goto continue_matching;
1740 goto fail;
1741 }
1742 case Cwordbeg:
1743 {
1744 if (text == textend)
1745 goto fail;
1746 if (SYNTAX(*text) != Sword)
1747 goto fail;
1748 if (text == textstart)
1749 goto continue_matching;
1750 if (SYNTAX(text[-1]) != Sword)
1751 goto continue_matching;
1752 goto fail;
1753 }
1754 case Cwordend:
1755 {
1756 if (text == textstart)
1757 goto fail;
1758 if (SYNTAX(text[-1]) != Sword)
1759 goto fail;
1760 if (text == textend)
1761 goto continue_matching;
1762 if (SYNTAX(*text) == Sword)
1763 goto fail;
1764 goto continue_matching;
1765 }
1766 case Cwordbound:
1767 {
1768 /* Note: as in gnu regexp, this also matches at the beginning
1769 * and end of buffer. */
1770
1771 if (text == textstart || text == textend)
1772 goto continue_matching;
1773 if ((SYNTAX(text[-1]) == Sword) ^ (SYNTAX(*text) == Sword))
1774 goto continue_matching;
1775 goto fail;
1776 }
1777 case Cnotwordbound:
1778 {
1779 /* Note: as in gnu regexp, this never matches at the beginning
1780 * and end of buffer. */
1781 if (text == textstart || text == textend)
1782 goto fail;
1783 if (!((SYNTAX(text[-1]) == Sword) ^ (SYNTAX(*text) == Sword)))
1784 goto fail;
1785 goto continue_matching;
1786 }
1787 case Csyntaxspec:
1788 {
1789 NEXTCHAR(ch);
1790 if (SYNTAX(ch) != (unsigned char)*code++)
1791 goto fail;
1792 goto continue_matching;
1793 }
1794 case Cnotsyntaxspec:
1795 {
1796 NEXTCHAR(ch);
1797 if (SYNTAX(ch) != (unsigned char)*code++)
1798 break;
1799 goto continue_matching;
1800 }
1801 default:
1802 {
1803 abort();
1804 /*NOTREACHED*/
1805 }
1806 }
1807
Guido van Rossum3b1a57a1992-01-27 16:47:46 +00001808#if 0 /* This line is never reached --Guido */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001809 abort();
Guido van Rossum5f21dd11992-01-19 16:49:14 +00001810#endif
Guido van Rossum004c1e11997-05-09 02:35:58 +00001811 /*
1812 *NOTREACHED
1813 */
1814
1815 fail:
1816 POP_FAILURE(state, code, text, goto done_matching, goto error);
1817 goto continue_matching;
1818
1819 done_matching:
1820/* if(translated != NULL) */
1821/* free(translated); */
1822 FREE_STATE(state);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001823 return -1;
1824
Guido van Rossum004c1e11997-05-09 02:35:58 +00001825 error:
1826/* if (translated != NULL) */
1827/* free(translated); */
1828 FREE_STATE(state);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001829 return -2;
1830}
1831
#undef PREFETCH
#undef NEXTCHAR

Guido van Rossum004c1e11997-05-09 02:35:58 +00001835int re_search(regexp_t bufp,
1836 char *string,
1837 int size,
1838 int pos,
1839 int range,
1840 regexp_registers_t regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001841{
Guido van Rossum004c1e11997-05-09 02:35:58 +00001842 char *fastmap;
1843 char *translate;
1844 char *text;
1845 char *partstart;
1846 char *partend;
1847 int dir;
1848 int ret;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001849 char anchor;
1850
Guido van Rossum004c1e11997-05-09 02:35:58 +00001851 assert(size >= 0 && pos >= 0);
1852 assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001853
1854 fastmap = bufp->fastmap;
1855 translate = bufp->translate;
1856 if (fastmap && !bufp->fastmap_accurate)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001857 re_compile_fastmap(bufp);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001858 anchor = bufp->anchor;
1859 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
Guido van Rossum004c1e11997-05-09 02:35:58 +00001860 fastmap = NULL;
1861
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001862 if (range < 0)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001863 {
1864 dir = -1;
1865 range = -range;
1866 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001867 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001868 dir = 1;
1869
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001870 if (anchor == 2)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001871 if (pos != 0)
1872 return -1;
1873 else
1874 range = 0;
1875
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001876 for (; range >= 0; range--, pos += dir)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001877 {
1878 if (fastmap)
1879 {
1880 if (dir == 1)
1881 { /* searching forwards */
1882
1883 text = string + pos;
1884 partend = string + size;
1885 partstart = text;
1886 if (translate)
1887 while (text != partend &&
1888 !fastmap[(unsigned char) translate[(unsigned char)*text]])
1889 text++;
1890 else
1891 while (text != partend && !fastmap[(unsigned char)*text])
1892 text++;
1893 pos += text - partstart;
1894 range -= text - partstart;
1895 if (pos == size && bufp->can_be_null == 0)
1896 return -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001897 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001898 else
1899 { /* searching backwards */
1900 text = string + pos;
1901 partstart = string + pos - range;
1902 partend = text;
1903 if (translate)
1904 while (text != partstart &&
1905 !fastmap[(unsigned char)
1906 translate[(unsigned char)*text]])
1907 text--;
1908 else
1909 while (text != partstart &&
1910 !fastmap[(unsigned char)*text])
1911 text--;
1912 pos -= partend - text;
1913 range -= partend - text;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001914 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001915 }
1916 if (anchor == 1)
1917 { /* anchored to begline */
1918 if (pos > 0 && string[pos - 1])
1919 continue;
1920 }
1921 assert(pos >= 0 && pos <= size);
1922 ret = re_match(bufp, string, size, pos, regs);
1923 if (ret >= 0)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001924 return pos;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001925 if (ret == -2)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001926 return -2;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001927 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001928 return -1;
1929}