blob: 84f0b9489e160fe04f8a291be208e6035587411d [file] [log] [blame]
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001/*
2
3regexpr.c
4
5Author: Tatu Ylonen <ylo@ngs.fi>
6
7Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
8
9Permission to use, copy, modify, distribute, and sell this software
10and its documentation for any purpose is hereby granted without fee,
11provided that the above copyright notice appear in all copies. This
12software is provided "as is" without express or implied warranty.
13
14Created: Thu Sep 26 17:14:05 1991 ylo
15Last modified: Mon Nov 4 17:06:48 1991 ylo
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000016Ported to Think C: 19 Jan 1992 guido@cwi.nl
Guido van Rossumb674c3b1992-01-19 16:32:47 +000017
18This code draws many ideas from the regular expression packages by
19Henry Spencer of the University of Toronto and Richard Stallman of the
20Free Software Foundation.
21
22Emacs-specific code and syntax table code is almost directly borrowed
23from GNU regexp.
24
Guido van Rossumb674c3b1992-01-19 16:32:47 +000025*/
26
Guido van Rossum339cfa31996-08-08 19:12:37 +000027#ifdef HAVE_CONFIG_H
28#include "config.h" /* For Win* specific redefinition of printf c.s. */
29#endif
30
Guido van Rossumb6775db1994-08-01 11:34:53 +000031#include "myproto.h" /* For PROTO macro --Guido */
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000032
Guido van Rossumb674c3b1992-01-19 16:32:47 +000033#include <stdio.h>
34#include <assert.h>
35#include "regexpr.h"
36
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000037#ifdef THINK_C
38/* Think C on the Mac really needs these headers... --Guido */
39#include <stdlib.h>
40#include <string.h>
41#else
Guido van Rossum88661e81996-05-23 22:55:58 +000042#if defined(__STDC__) || defined(_MSC_VER)
Guido van Rossum9abc5391992-03-27 17:24:37 +000043/* Don't mess around, use the standard headers */
44#include <stdlib.h>
45#include <string.h>
46#else
Guido van Rossumb674c3b1992-01-19 16:32:47 +000047char *malloc();
48void free();
49char *realloc();
Guido van Rossum9abc5391992-03-27 17:24:37 +000050#endif /* __STDC__ */
51#endif /* THINK_C */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000052
53#define MACRO_BEGIN do {
54#define MACRO_END } while (0)
55
56enum regexp_compiled_ops /* opcodes for compiled regexp */
57{
58 Cend, /* end of pattern reached */
59 Cbol, /* beginning of line */
60 Ceol, /* end of line */
61 Cset, /* character set. Followed by 32 bytes of set. */
62 Cexact, /* followed by a byte to match */
63 Canychar, /* matches any character except newline */
64 Cstart_memory, /* set register start addr (followed by reg number) */
65 Cend_memory, /* set register end addr (followed by reg number) */
66 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
67 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
68 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
69 Cfailure_jump, /* jump to addr on failure */
70 Cupdate_failure_jump, /* update topmost failure point and jump */
71 Cdummy_failure_jump, /* push a dummy failure point and jump */
72 Cbegbuf, /* match at beginning of buffer */
73 Cendbuf, /* match at end of buffer */
74 Cwordbeg, /* match at beginning of word */
75 Cwordend, /* match at end of word */
76 Cwordbound, /* match if at word boundary */
77 Cnotwordbound, /* match if not at word boundary */
78#ifdef emacs
79 Cemacs_at_dot, /* emacs only: matches at dot */
80#endif /* emacs */
81 Csyntaxspec, /* matches syntax code (1 byte follows) */
82 Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/
83};
84
85enum regexp_syntax_op /* syntax codes for plain and quoted characters */
86{
87 Rend, /* special code for end of regexp */
88 Rnormal, /* normal character */
89 Ranychar, /* any character except newline */
90 Rquote, /* the quote character */
91 Rbol, /* match beginning of line */
92 Reol, /* match end of line */
93 Roptional, /* match preceding expression optionally */
94 Rstar, /* match preceding expr zero or more times */
95 Rplus, /* match preceding expr one or more times */
96 Ror, /* match either of alternatives */
97 Ropenpar, /* opening parenthesis */
98 Rclosepar, /* closing parenthesis */
99 Rmemory, /* match memory register */
100 Rextended_memory, /* \vnn to match registers 10-99 */
101 Ropenset, /* open set. Internal syntax hard-coded below. */
102 /* the following are gnu extensions to "normal" regexp syntax */
103 Rbegbuf, /* beginning of buffer */
104 Rendbuf, /* end of buffer */
105 Rwordchar, /* word character */
106 Rnotwordchar, /* not word character */
107 Rwordbeg, /* beginning of word */
108 Rwordend, /* end of word */
109 Rwordbound, /* word bound */
110 Rnotwordbound, /* not word bound */
111#ifdef emacs
112 Remacs_at_dot, /* emacs: at dot */
113 Remacs_syntaxspec, /* syntaxspec */
114 Remacs_notsyntaxspec, /* notsyntaxspec */
115#endif /* emacs */
116 Rnum_ops
117};
118
119static int re_compile_initialized = 0;
120static int regexp_syntax = 0;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000121int re_syntax = 0; /* Exported copy of regexp_syntax */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000122static unsigned char regexp_plain_ops[256];
123static unsigned char regexp_quoted_ops[256];
124static unsigned char regexp_precedences[Rnum_ops];
125static int regexp_context_indep_ops;
126static int regexp_ansi_sequences;
127
128#define NUM_LEVELS 5 /* number of precedence levels in use */
129#define MAX_NESTING 100 /* max nesting level of operators */
130
131#ifdef emacs
132
133/* This code is for emacs compatibility only. */
134
135#include "config.h"
136#include "lisp.h"
137#include "buffer.h"
138#include "syntax.h"
139
140/* emacs defines NULL in some strange way? */
141#undef NULL
142#define NULL 0
143
144#else /* emacs */
145
146#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
147#define Sword 1
148
149#ifdef SYNTAX_TABLE
150char *re_syntax_table;
151#else
152static char re_syntax_table[256];
153#endif /* SYNTAX_TABLE */
154
155#endif /* emacs */
156
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000157static void re_compile_initialize PROTO((void));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000158static void re_compile_initialize()
159{
160 int a;
161
162#if !defined(emacs) && !defined(SYNTAX_TABLE)
163 static int syntax_table_inited = 0;
164
165 if (!syntax_table_inited)
166 {
167 syntax_table_inited = 1;
168 memset(re_syntax_table, 0, 256);
169 for (a = 'a'; a <= 'z'; a++)
170 re_syntax_table[a] = Sword;
171 for (a = 'A'; a <= 'Z'; a++)
172 re_syntax_table[a] = Sword;
173 for (a = '0'; a <= '9'; a++)
174 re_syntax_table[a] = Sword;
175 }
176#endif /* !emacs && !SYNTAX_TABLE */
177 re_compile_initialized = 1;
178 for (a = 0; a < 256; a++)
179 {
180 regexp_plain_ops[a] = Rnormal;
181 regexp_quoted_ops[a] = Rnormal;
182 }
183 for (a = '0'; a <= '9'; a++)
184 regexp_quoted_ops[a] = Rmemory;
185 regexp_plain_ops['\134'] = Rquote;
186 if (regexp_syntax & RE_NO_BK_PARENS)
187 {
188 regexp_plain_ops['('] = Ropenpar;
189 regexp_plain_ops[')'] = Rclosepar;
190 }
191 else
192 {
193 regexp_quoted_ops['('] = Ropenpar;
194 regexp_quoted_ops[')'] = Rclosepar;
195 }
196 if (regexp_syntax & RE_NO_BK_VBAR)
197 regexp_plain_ops['\174'] = Ror;
198 else
199 regexp_quoted_ops['\174'] = Ror;
200 regexp_plain_ops['*'] = Rstar;
201 if (regexp_syntax & RE_BK_PLUS_QM)
202 {
203 regexp_quoted_ops['+'] = Rplus;
204 regexp_quoted_ops['?'] = Roptional;
205 }
206 else
207 {
208 regexp_plain_ops['+'] = Rplus;
209 regexp_plain_ops['?'] = Roptional;
210 }
211 if (regexp_syntax & RE_NEWLINE_OR)
212 regexp_plain_ops['\n'] = Ror;
213 regexp_plain_ops['\133'] = Ropenset;
214 regexp_plain_ops['\136'] = Rbol;
215 regexp_plain_ops['$'] = Reol;
216 regexp_plain_ops['.'] = Ranychar;
217 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
218 {
219#ifdef emacs
220 regexp_quoted_ops['='] = Remacs_at_dot;
221 regexp_quoted_ops['s'] = Remacs_syntaxspec;
222 regexp_quoted_ops['S'] = Remacs_notsyntaxspec;
223#endif /* emacs */
224 regexp_quoted_ops['w'] = Rwordchar;
225 regexp_quoted_ops['W'] = Rnotwordchar;
226 regexp_quoted_ops['<'] = Rwordbeg;
227 regexp_quoted_ops['>'] = Rwordend;
228 regexp_quoted_ops['b'] = Rwordbound;
229 regexp_quoted_ops['B'] = Rnotwordbound;
230 regexp_quoted_ops['`'] = Rbegbuf;
231 regexp_quoted_ops['\''] = Rendbuf;
232 }
233 if (regexp_syntax & RE_ANSI_HEX)
234 regexp_quoted_ops['v'] = Rextended_memory;
235 for (a = 0; a < Rnum_ops; a++)
236 regexp_precedences[a] = 4;
237 if (regexp_syntax & RE_TIGHT_VBAR)
238 {
239 regexp_precedences[Ror] = 3;
240 regexp_precedences[Rbol] = 2;
241 regexp_precedences[Reol] = 2;
242 }
243 else
244 {
245 regexp_precedences[Ror] = 2;
246 regexp_precedences[Rbol] = 3;
247 regexp_precedences[Reol] = 3;
248 }
249 regexp_precedences[Rclosepar] = 1;
250 regexp_precedences[Rend] = 0;
251 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
252 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
253}
254
255int re_set_syntax(syntax)
256int syntax;
257{
258 int ret;
259
260 ret = regexp_syntax;
261 regexp_syntax = syntax;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000262 re_syntax = syntax; /* Exported copy */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000263 re_compile_initialize();
264 return ret;
265}
266
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000267static int hex_char_to_decimal PROTO((int));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000268static int hex_char_to_decimal(ch)
269int ch;
270{
271 if (ch >= '0' && ch <= '9')
272 return ch - '0';
273 if (ch >= 'a' && ch <= 'f')
274 return ch - 'a' + 10;
275 if (ch >= 'A' && ch <= 'F')
276 return ch - 'A' + 10;
277 return 16;
278}
279
280char *re_compile_pattern(regex, size, bufp)
281char *regex;
282int size;
283regexp_t bufp;
284{
285 int a, pos, op, current_level, level, opcode;
286 int pattern_offset, alloc;
287 int starts[NUM_LEVELS * MAX_NESTING], starts_base;
288 int future_jumps[MAX_NESTING], num_jumps;
289 unsigned char ch;
290 char *pattern, *translate;
291 int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS];
292 int beginning_context;
293
294#define NEXTCHAR(var) \
295 MACRO_BEGIN \
296 if (pos >= size) \
297 goto ends_prematurely; \
298 (var) = regex[pos]; \
299 pos++; \
300 MACRO_END
301
302#define ALLOC(amount) \
303 MACRO_BEGIN \
304 if (pattern_offset+(amount) > alloc) \
305 { \
306 alloc += 256 + (amount); \
307 pattern = realloc(pattern, alloc); \
308 if (!pattern) \
309 goto out_of_memory; \
310 } \
311 MACRO_END
312
313#define STORE(ch) pattern[pattern_offset++] = (ch)
314
315#define CURRENT_LEVEL_START (starts[starts_base + current_level])
316
317#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
318
319#define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
320 starts_base += NUM_LEVELS; \
321 else \
322 goto too_complex
323
324#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
325
326#define PUT_ADDR(offset,addr) \
327 MACRO_BEGIN \
328 int disp = (addr) - (offset) - 2; \
329 pattern[(offset)] = disp & 0xff; \
330 pattern[(offset)+1] = (disp>>8) & 0xff; \
331 MACRO_END
332
333#define INSERT_JUMP(pos,type,addr) \
334 MACRO_BEGIN \
335 int a, p = (pos), t = (type), ad = (addr); \
336 for (a = pattern_offset - 1; a >= p; a--) \
337 pattern[a + 3] = pattern[a]; \
338 pattern[p] = t; \
339 PUT_ADDR(p+1,ad); \
340 pattern_offset += 3; \
341 MACRO_END
342
343#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
344
345#define SET_FIELDS \
346 MACRO_BEGIN \
347 bufp->allocated = alloc; \
348 bufp->buffer = pattern; \
349 bufp->used = pattern_offset; \
350 MACRO_END
351
352#define GETHEX(var) \
353 MACRO_BEGIN \
354 char gethex_ch, gethex_value; \
355 NEXTCHAR(gethex_ch); \
356 gethex_value = hex_char_to_decimal(gethex_ch); \
357 if (gethex_value == 16) \
358 goto hex_error; \
359 NEXTCHAR(gethex_ch); \
360 gethex_ch = hex_char_to_decimal(gethex_ch); \
361 if (gethex_ch == 16) \
362 goto hex_error; \
363 (var) = gethex_value * 16 + gethex_ch; \
364 MACRO_END
365
366#define ANSI_TRANSLATE(ch) \
367 MACRO_BEGIN \
368 switch (ch) \
369 { \
370 case 'a': \
371 case 'A': \
372 ch = 7; /* audible bell */ \
373 break; \
374 case 'b': \
375 case 'B': \
376 ch = 8; /* backspace */ \
377 break; \
378 case 'f': \
379 case 'F': \
380 ch = 12; /* form feed */ \
381 break; \
382 case 'n': \
383 case 'N': \
384 ch = 10; /* line feed */ \
385 break; \
386 case 'r': \
387 case 'R': \
388 ch = 13; /* carriage return */ \
389 break; \
390 case 't': \
391 case 'T': \
392 ch = 9; /* tab */ \
393 break; \
394 case 'v': \
395 case 'V': \
396 ch = 11; /* vertical tab */ \
397 break; \
398 case 'x': /* hex code */ \
399 case 'X': \
400 GETHEX(ch); \
401 break; \
402 default: \
403 /* other characters passed through */ \
404 if (translate) \
405 ch = translate[(unsigned char)ch]; \
406 break; \
407 } \
408 MACRO_END
409
410 if (!re_compile_initialized)
411 re_compile_initialize();
412 bufp->used = 0;
413 bufp->fastmap_accurate = 0;
414 bufp->uses_registers = 0;
415 translate = bufp->translate;
416 pattern = bufp->buffer;
417 alloc = bufp->allocated;
418 if (alloc == 0 || pattern == NULL)
419 {
420 alloc = 256;
421 pattern = malloc(alloc);
422 if (!pattern)
423 goto out_of_memory;
424 }
425 pattern_offset = 0;
426 starts_base = 0;
427 num_jumps = 0;
428 current_level = 0;
429 SET_LEVEL_START;
430 num_open_registers = 0;
431 next_register = 1;
432 paren_depth = 0;
433 beginning_context = 1;
434 op = -1;
435 /* we use Rend dummy to ensure that pending jumps are updated (due to
436 low priority of Rend) before exiting the loop. */
437 pos = 0;
438 while (op != Rend)
439 {
440 if (pos >= size)
441 op = Rend;
442 else
443 {
444 NEXTCHAR(ch);
445 if (translate)
446 ch = translate[(unsigned char)ch];
447 op = regexp_plain_ops[(unsigned char)ch];
448 if (op == Rquote)
449 {
450 NEXTCHAR(ch);
451 op = regexp_quoted_ops[(unsigned char)ch];
452 if (op == Rnormal && regexp_ansi_sequences)
453 ANSI_TRANSLATE(ch);
454 }
455 }
456 level = regexp_precedences[op];
457 /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
458 ch, op, level, current_level, CURRENT_LEVEL_START); */
459 if (level > current_level)
460 {
461 for (current_level++; current_level < level; current_level++)
462 SET_LEVEL_START;
463 SET_LEVEL_START;
464 }
465 else
466 if (level < current_level)
467 {
468 current_level = level;
469 for (;num_jumps > 0 &&
470 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
471 num_jumps--)
472 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
473 }
474 switch (op)
475 {
476 case Rend:
477 break;
478 case Rnormal:
479 normal_char:
480 opcode = Cexact;
481 store_opcode_and_arg: /* opcode & ch must be set */
482 SET_LEVEL_START;
483 ALLOC(2);
484 STORE(opcode);
485 STORE(ch);
486 break;
487 case Ranychar:
488 opcode = Canychar;
489 store_opcode:
490 SET_LEVEL_START;
491 ALLOC(1);
492 STORE(opcode);
493 break;
494 case Rquote:
495 abort();
496 /*NOTREACHED*/
497 case Rbol:
498 if (!beginning_context)
499 if (regexp_context_indep_ops)
500 goto op_error;
501 else
502 goto normal_char;
503 opcode = Cbol;
504 goto store_opcode;
505 case Reol:
506 if (!((pos >= size) ||
507 ((regexp_syntax & RE_NO_BK_VBAR) ?
508 (regex[pos] == '\174') :
509 (pos+1 < size && regex[pos] == '\134' &&
510 regex[pos+1] == '\174')) ||
511 ((regexp_syntax & RE_NO_BK_PARENS)?
512 (regex[pos] == ')'):
513 (pos+1 < size && regex[pos] == '\134' &&
514 regex[pos+1] == ')'))))
515 if (regexp_context_indep_ops)
516 goto op_error;
517 else
518 goto normal_char;
519 opcode = Ceol;
520 goto store_opcode;
Guido van Rossum9abc5391992-03-27 17:24:37 +0000521 /* NOTREACHED */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000522 break;
523 case Roptional:
524 if (beginning_context)
525 if (regexp_context_indep_ops)
526 goto op_error;
527 else
528 goto normal_char;
529 if (CURRENT_LEVEL_START == pattern_offset)
530 break; /* ignore empty patterns for ? */
531 ALLOC(3);
532 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
533 pattern_offset + 3);
534 break;
535 case Rstar:
536 case Rplus:
537 if (beginning_context)
538 if (regexp_context_indep_ops)
539 goto op_error;
540 else
541 goto normal_char;
542 if (CURRENT_LEVEL_START == pattern_offset)
543 break; /* ignore empty patterns for + and * */
544 ALLOC(9);
545 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
546 pattern_offset + 6);
547 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
548 if (op == Rplus) /* jump over initial failure_jump */
549 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
550 CURRENT_LEVEL_START + 6);
551 break;
552 case Ror:
553 ALLOC(6);
554 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
555 pattern_offset + 6);
556 if (num_jumps >= MAX_NESTING)
557 goto too_complex;
558 STORE(Cjump);
559 future_jumps[num_jumps++] = pattern_offset;
560 STORE(0);
561 STORE(0);
562 SET_LEVEL_START;
563 break;
564 case Ropenpar:
565 SET_LEVEL_START;
566 if (next_register < RE_NREGS)
567 {
568 bufp->uses_registers = 1;
569 ALLOC(2);
570 STORE(Cstart_memory);
571 STORE(next_register);
572 open_registers[num_open_registers++] = next_register;
573 next_register++;
574 }
575 paren_depth++;
576 PUSH_LEVEL_STARTS;
577 current_level = 0;
578 SET_LEVEL_START;
579 break;
580 case Rclosepar:
581 if (paren_depth <= 0)
582 goto parenthesis_error;
583 POP_LEVEL_STARTS;
584 current_level = regexp_precedences[Ropenpar];
585 paren_depth--;
586 if (paren_depth < num_open_registers)
587 {
588 bufp->uses_registers = 1;
589 ALLOC(2);
590 STORE(Cend_memory);
591 num_open_registers--;
592 STORE(open_registers[num_open_registers]);
593 }
594 break;
595 case Rmemory:
596 if (ch == '0')
597 goto bad_match_register;
598 assert(ch >= '0' && ch <= '9');
599 bufp->uses_registers = 1;
600 opcode = Cmatch_memory;
601 ch -= '0';
602 goto store_opcode_and_arg;
603 case Rextended_memory:
604 NEXTCHAR(ch);
605 if (ch < '0' || ch > '9')
606 goto bad_match_register;
607 NEXTCHAR(a);
608 if (a < '0' || a > '9')
609 goto bad_match_register;
610 ch = 10 * (a - '0') + ch - '0';
611 if (ch <= 0 || ch >= RE_NREGS)
612 goto bad_match_register;
613 bufp->uses_registers = 1;
614 opcode = Cmatch_memory;
615 goto store_opcode_and_arg;
616 case Ropenset:
617 {
618 int complement,prev,offset,range,firstchar;
619
620 SET_LEVEL_START;
621 ALLOC(1+256/8);
622 STORE(Cset);
623 offset = pattern_offset;
624 for (a = 0; a < 256/8; a++)
625 STORE(0);
626 NEXTCHAR(ch);
627 if (translate)
628 ch = translate[(unsigned char)ch];
629 if (ch == '\136')
630 {
631 complement = 1;
632 NEXTCHAR(ch);
633 if (translate)
634 ch = translate[(unsigned char)ch];
635 }
636 else
637 complement = 0;
638 prev = -1;
639 range = 0;
640 firstchar = 1;
641 while (ch != '\135' || firstchar)
642 {
643 firstchar = 0;
644 if (regexp_ansi_sequences && ch == '\134')
645 {
646 NEXTCHAR(ch);
647 ANSI_TRANSLATE(ch);
648 }
649 if (range)
650 {
Guido van Rossumb6775db1994-08-01 11:34:53 +0000651 for (a = prev; a <= (int)ch; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000652 SETBIT(pattern, offset, a);
653 prev = -1;
654 range = 0;
655 }
656 else
657 if (prev != -1 && ch == '-')
658 range = 1;
659 else
660 {
661 SETBIT(pattern, offset, ch);
662 prev = ch;
663 }
664 NEXTCHAR(ch);
665 if (translate)
666 ch = translate[(unsigned char)ch];
667 }
668 if (range)
669 SETBIT(pattern, offset, '-');
670 if (complement)
671 {
672 for (a = 0; a < 256/8; a++)
673 pattern[offset+a] ^= 0xff;
674 }
675 break;
676 }
677 case Rbegbuf:
678 opcode = Cbegbuf;
679 goto store_opcode;
680 case Rendbuf:
681 opcode = Cendbuf;
682 goto store_opcode;
683 case Rwordchar:
684 opcode = Csyntaxspec;
685 ch = Sword;
686 goto store_opcode_and_arg;
687 case Rnotwordchar:
688 opcode = Cnotsyntaxspec;
689 ch = Sword;
690 goto store_opcode_and_arg;
691 case Rwordbeg:
692 opcode = Cwordbeg;
693 goto store_opcode;
694 case Rwordend:
695 opcode = Cwordend;
696 goto store_opcode;
697 case Rwordbound:
698 opcode = Cwordbound;
699 goto store_opcode;
700 case Rnotwordbound:
701 opcode = Cnotwordbound;
702 goto store_opcode;
703#ifdef emacs
704 case Remacs_at_dot:
705 opcode = Cemacs_at_dot;
706 goto store_opcode;
707 case Remacs_syntaxspec:
708 NEXTCHAR(ch);
709 if (translate)
710 ch = translate[(unsigned char)ch];
711 opcode = Csyntaxspec;
712 ch = syntax_spec_code[(unsigned char)ch];
713 goto store_opcode_and_arg;
714 case Remacs_notsyntaxspec:
715 NEXTCHAR(ch);
716 if (translate)
717 ch = translate[(unsigned char)ch];
718 opcode = Cnotsyntaxspec;
719 ch = syntax_spec_code[(unsigned char)ch];
720 goto store_opcode_and_arg;
721#endif /* emacs */
722 default:
723 abort();
724 }
725 beginning_context = (op == Ropenpar || op == Ror);
726 }
727 if (starts_base != 0)
728 goto parenthesis_error;
729 assert(num_jumps == 0);
730 ALLOC(1);
731 STORE(Cend);
732 SET_FIELDS;
733 return NULL;
734
735 op_error:
736 SET_FIELDS;
737 return "Badly placed special character";
738
739 bad_match_register:
740 SET_FIELDS;
741 return "Bad match register number";
742
743 hex_error:
744 SET_FIELDS;
745 return "Bad hexadecimal number";
746
747 parenthesis_error:
748 SET_FIELDS;
749 return "Badly placed parenthesis";
750
751 out_of_memory:
752 SET_FIELDS;
753 return "Out of memory";
754
755 ends_prematurely:
756 SET_FIELDS;
757 return "Regular expression ends prematurely";
758
759 too_complex:
760 SET_FIELDS;
761 return "Regular expression too complex";
762}
763#undef CHARAT
764#undef NEXTCHAR
765#undef GETHEX
766#undef ALLOC
767#undef STORE
768#undef CURRENT_LEVEL_START
769#undef SET_LEVEL_START
770#undef PUSH_LEVEL_STARTS
771#undef POP_LEVEL_STARTS
772#undef PUT_ADDR
773#undef INSERT_JUMP
774#undef SETBIT
775#undef SET_FIELDS
776
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000777static void re_compile_fastmap_aux
778 PROTO((char *, int, char *, char *, char *));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000779static void re_compile_fastmap_aux(code, pos, visited, can_be_null, fastmap)
780char *code, *visited, *can_be_null, *fastmap;
781int pos;
782{
783 int a, b, syntaxcode;
784
785 if (visited[pos])
786 return; /* we have already been here */
787 visited[pos] = 1;
788 for (;;)
789 switch (code[pos++])
790 {
791 case Cend:
792 *can_be_null = 1;
793 return;
794 case Cbol:
795 case Cbegbuf:
796 case Cendbuf:
797 case Cwordbeg:
798 case Cwordend:
799 case Cwordbound:
800 case Cnotwordbound:
801#ifdef emacs
802 case Cemacs_at_dot:
803#endif /* emacs */
804 break;
805 case Csyntaxspec:
806 syntaxcode = code[pos++];
807 for (a = 0; a < 256; a++)
808 if (SYNTAX(a) == syntaxcode)
809 fastmap[a] = 1;
810 return;
811 case Cnotsyntaxspec:
812 syntaxcode = code[pos++];
813 for (a = 0; a < 256; a++)
814 if (SYNTAX(a) != syntaxcode)
815 fastmap[a] = 1;
816 return;
817 case Ceol:
818 fastmap['\n'] = 1;
819 if (*can_be_null == 0)
820 *can_be_null = 2; /* can match null, but only at end of buffer*/
821 return;
822 case Cset:
823 for (a = 0; a < 256/8; a++)
824 if (code[pos + a] != 0)
825 for (b = 0; b < 8; b++)
826 if (code[pos + a] & (1 << b))
827 fastmap[(a << 3) + b] = 1;
828 pos += 256/8;
829 return;
830 case Cexact:
831 fastmap[(unsigned char)code[pos]] = 1;
832 return;
833 case Canychar:
834 for (a = 0; a < 256; a++)
835 if (a != '\n')
836 fastmap[a] = 1;
837 return;
838 case Cstart_memory:
839 case Cend_memory:
840 pos++;
841 break;
842 case Cmatch_memory:
843 /* should this ever happen for sensible patterns??? */
844 *can_be_null = 1;
845 return;
846 case Cjump:
847 case Cdummy_failure_jump:
848 case Cupdate_failure_jump:
849 case Cstar_jump:
850 a = (unsigned char)code[pos++];
851 a |= (unsigned char)code[pos++] << 8;
852 pos += (int)(short)a;
853 if (visited[pos])
854 {
855 /* argh... the regexp contains empty loops. This is not
856 good, as this may cause a failure stack overflow when
857 matching. Oh well. */
858 /* this path leads nowhere; pursue other paths. */
859 return;
860 }
861 visited[pos] = 1;
862 break;
863 case Cfailure_jump:
864 a = (unsigned char)code[pos++];
865 a |= (unsigned char)code[pos++] << 8;
866 a = pos + (int)(short)a;
867 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
868 break;
869 default:
870 abort(); /* probably some opcode is missing from this switch */
871 /*NOTREACHED*/
872 }
873}
874
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000875static int re_do_compile_fastmap PROTO((char *, int, int, char *, char *));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000876static int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
877char *buffer, *fastmap, *can_be_null;
878int used, pos;
879{
880 char small_visited[512], *visited;
881
882 if (used <= sizeof(small_visited))
883 visited = small_visited;
884 else
885 {
886 visited = malloc(used);
887 if (!visited)
888 return 0;
889 }
890 *can_be_null = 0;
891 memset(fastmap, 0, 256);
892 memset(visited, 0, used);
893 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
894 if (visited != small_visited)
895 free(visited);
896 return 1;
897}
898
899void re_compile_fastmap(bufp)
900regexp_t bufp;
901{
902 if (!bufp->fastmap || bufp->fastmap_accurate)
903 return;
904 assert(bufp->used > 0);
905 if (!re_do_compile_fastmap(bufp->buffer, bufp->used, 0, &bufp->can_be_null,
906 bufp->fastmap))
907 return;
908 if (bufp->buffer[0] == Cbol)
909 bufp->anchor = 1; /* begline */
910 else
911 if (bufp->buffer[0] == Cbegbuf)
912 bufp->anchor = 2; /* begbuf */
913 else
914 bufp->anchor = 0; /* none */
915 bufp->fastmap_accurate = 1;
916}
917
918#define INITIAL_FAILURES 128 /* initial # failure points to allocate */
Guido van Rossum88661e81996-05-23 22:55:58 +0000919#define MAX_FAILURES 4100L /* max # of failure points before failing */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000920
921int re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop)
922regexp_t bufp;
923char *string1, *string2;
924int size1, size2, pos, mstop;
925regexp_registers_t regs;
926{
927 struct failure_point { char *text, *partend, *code; }
928 *failure_stack_start, *failure_sp, *failure_stack_end,
929 initial_failure_stack[INITIAL_FAILURES];
930 char *code, *translate, *text, *textend, *partend, *part_2_end;
931 char *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
932 char *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
933 int a, b, ch, reg, regch, match_end;
934 char *regtext, *regpartend, *regtextend;
935
936#define PREFETCH \
937 MACRO_BEGIN \
938 if (text == partend) \
939 { \
940 if (text == textend) \
941 goto fail; \
942 text = string2; \
943 partend = part_2_end; \
944 } \
945 MACRO_END
946
947#define NEXTCHAR(var) \
948 MACRO_BEGIN \
949 PREFETCH; \
950 (var) = (unsigned char)*text++; \
951 if (translate) \
952 (var) = (unsigned char)translate[(var)]; \
953 MACRO_END
954
955 assert(pos >= 0 && size1 >= 0 && size2 >= 0 && mstop >= 0);
956 assert(mstop <= size1 + size2);
957 assert(pos <= mstop);
958
959 if (pos <= size1)
960 {
961 text = string1 + pos;
962 if (mstop <= size1)
963 {
964 partend = string1 + mstop;
965 textend = partend;
966 }
967 else
968 {
969 partend = string1 + size1;
970 textend = string2 + mstop - size1;
971 }
972 part_2_end = string2 + mstop - size1;
973 }
974 else
975 {
976 text = string2 + pos - size1;
977 partend = string2 + mstop - size1;
978 textend = partend;
979 part_2_end = partend;
980 }
981
982 if (bufp->uses_registers && regs != NULL)
983 for (a = 0; a < RE_NREGS; a++)
984 regend_text[a] = NULL;
985
986 code = bufp->buffer;
987 translate = bufp->translate;
988 failure_stack_start = failure_sp = initial_failure_stack;
989 failure_stack_end = initial_failure_stack + INITIAL_FAILURES;
990
991#if 0
992 /* re_search_2 has already done this, and otherwise we get little benefit
993 from this. So I'll leave this out. */
994 if (bufp->fastmap_accurate && !bufp->can_be_null &&
995 text != textend &&
996 !bufp->fastmap[translate ?
997 (unsigned char)translate[(unsigned char)*text] :
998 (unsigned char)*text])
999 return -1; /* it can't possibly match */
1000#endif
1001
1002 continue_matching:
1003 for (;;)
1004 {
1005 switch (*code++)
1006 {
1007 case Cend:
1008 if (partend != part_2_end)
1009 match_end = text - string1;
1010 else
1011 match_end = text - string2 + size1;
1012 if (regs)
1013 {
1014 regs->start[0] = pos;
1015 regs->end[0] = match_end;
1016 if (!bufp->uses_registers)
1017 {
1018 for (a = 1; a < RE_NREGS; a++)
1019 {
1020 regs->start[a] = -1;
1021 regs->end[a] = -1;
1022 }
1023 }
1024 else
1025 {
1026 for (a = 1; a < RE_NREGS; a++)
1027 {
1028 if (regend_text[a] == NULL)
1029 {
1030 regs->start[a] = -1;
1031 regs->end[a] = -1;
1032 continue;
1033 }
1034 if (regstart_partend[a] != part_2_end)
1035 regs->start[a] = regstart_text[a] - string1;
1036 else
1037 regs->start[a] = regstart_text[a] - string2 + size1;
1038 if (regend_partend[a] != part_2_end)
1039 regs->end[a] = regend_text[a] - string1;
1040 else
1041 regs->end[a] = regend_text[a] - string2 + size1;
1042 }
1043 }
1044 }
1045 if (failure_stack_start != initial_failure_stack)
1046 free((char *)failure_stack_start);
1047 return match_end - pos;
1048 case Cbol:
1049 if (text == string1 || text[-1] == '\n') /* text[-1] always valid */
1050 break;
1051 goto fail;
1052 case Ceol:
1053 if (text == string2 + size2 ||
1054 (text == string1 + size1 ?
1055 (size2 == 0 || *string2 == '\n') :
1056 *text == '\n'))
1057 break;
1058 goto fail;
1059 case Cset:
1060 NEXTCHAR(ch);
1061 if (code[ch/8] & (1<<(ch & 7)))
1062 {
1063 code += 256/8;
1064 break;
1065 }
1066 goto fail;
1067 case Cexact:
1068 NEXTCHAR(ch);
1069 if (ch != (unsigned char)*code++)
1070 goto fail;
1071 break;
1072 case Canychar:
1073 NEXTCHAR(ch);
1074 if (ch == '\n')
1075 goto fail;
1076 break;
1077 case Cstart_memory:
1078 reg = *code++;
1079 regstart_text[reg] = text;
1080 regstart_partend[reg] = partend;
1081 break;
1082 case Cend_memory:
1083 reg = *code++;
1084 regend_text[reg] = text;
1085 regend_partend[reg] = partend;
1086 break;
1087 case Cmatch_memory:
1088 reg = *code++;
1089 if (regend_text[reg] == NULL)
1090 goto fail; /* or should we just match nothing? */
1091 regtext = regstart_text[reg];
1092 regtextend = regend_text[reg];
1093 if (regstart_partend[reg] == regend_partend[reg])
1094 regpartend = regtextend;
1095 else
1096 regpartend = string1 + size1;
1097
1098 for (;regtext != regtextend;)
1099 {
1100 NEXTCHAR(ch);
1101 if (regtext == regpartend)
1102 regtext = string2;
1103 regch = (unsigned char)*regtext++;
1104 if (translate)
1105 regch = (unsigned char)translate[regch];
1106 if (regch != ch)
1107 goto fail;
1108 }
1109 break;
1110 case Cstar_jump:
1111 /* star is coded as:
1112 1: failure_jump 2
1113 ... code for operand of star
1114 star_jump 1
1115 2: ... code after star
1116 We change the star_jump to update_failure_jump if we can determine
1117 that it is safe to do so; otherwise we change it to an ordinary
1118 jump.
1119 plus is coded as
1120 jump 2
1121 1: failure_jump 3
1122 2: ... code for operand of plus
1123 star_jump 1
1124 3: ... code after plus
1125 For star_jump considerations this is processed identically
1126 to star. */
1127 a = (unsigned char)*code++;
1128 a |= (unsigned char)*code++ << 8;
1129 a = (int)(short)a;
1130 {
1131 char map[256], can_be_null;
1132 char *p1, *p2;
1133
1134 p1 = code + a + 3; /* skip the failure_jump */
1135 assert(p1[-3] == Cfailure_jump);
1136 p2 = code;
1137 /* p1 points inside loop, p2 points to after loop */
1138 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
1139 p2 - bufp->buffer, &can_be_null, map))
1140 goto make_normal_jump;
1141 /* If we might introduce a new update point inside the loop,
1142 we can't optimize because then update_jump would update a
1143 wrong failure point. Thus we have to be quite careful here. */
1144 loop_p1:
1145 /* loop until we find something that consumes a character */
1146 switch (*p1++)
1147 {
1148 case Cbol:
1149 case Ceol:
1150 case Cbegbuf:
1151 case Cendbuf:
1152 case Cwordbeg:
1153 case Cwordend:
1154 case Cwordbound:
1155 case Cnotwordbound:
1156#ifdef emacs
1157 case Cemacs_at_dot:
1158#endif /* emacs */
1159 goto loop_p1;
1160 case Cstart_memory:
1161 case Cend_memory:
1162 p1++;
1163 goto loop_p1;
1164 case Cexact:
1165 ch = (unsigned char)*p1++;
1166 if (map[ch])
1167 goto make_normal_jump;
1168 break;
1169 case Canychar:
1170 for (b = 0; b < 256; b++)
1171 if (b != '\n' && map[b])
1172 goto make_normal_jump;
1173 break;
1174 case Cset:
1175 for (b = 0; b < 256; b++)
1176 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
1177 goto make_normal_jump;
1178 p1 += 256/8;
1179 break;
1180 default:
1181 goto make_normal_jump;
1182 }
1183 /* now we know that we can't backtrack. */
1184 while (p1 != p2 - 3)
1185 {
1186 switch (*p1++)
1187 {
1188 case Cend:
1189 abort(); /* we certainly shouldn't get this inside loop */
1190 /*NOTREACHED*/
1191 case Cbol:
1192 case Ceol:
1193 case Canychar:
1194 case Cbegbuf:
1195 case Cendbuf:
1196 case Cwordbeg:
1197 case Cwordend:
1198 case Cwordbound:
1199 case Cnotwordbound:
1200#ifdef emacs
1201 case Cemacs_at_dot:
1202#endif /* emacs */
1203 break;
1204 case Cset:
1205 p1 += 256/8;
1206 break;
1207 case Cexact:
1208 case Cstart_memory:
1209 case Cend_memory:
1210 case Cmatch_memory:
1211 case Csyntaxspec:
1212 case Cnotsyntaxspec:
1213 p1++;
1214 break;
1215 case Cjump:
1216 case Cstar_jump:
1217 case Cfailure_jump:
1218 case Cupdate_failure_jump:
1219 case Cdummy_failure_jump:
1220 goto make_normal_jump;
1221 default:
1222 printf("regexpr.c: processing star_jump: unknown op %d\n", p1[-1]);
1223 break;
1224 }
1225 }
1226 goto make_update_jump;
1227 }
1228 make_normal_jump:
1229 /* printf("changing to normal jump\n"); */
1230 code -= 3;
1231 *code = Cjump;
1232 break;
1233 make_update_jump:
1234 /* printf("changing to update jump\n"); */
1235 code -= 2;
1236 a += 3; /* jump to after the Cfailure_jump */
1237 code[-1] = Cupdate_failure_jump;
1238 code[0] = a & 0xff;
1239 code[1] = a >> 8;
1240 /* fall to next case */
1241 case Cupdate_failure_jump:
1242 failure_sp[-1].text = text;
1243 failure_sp[-1].partend = partend;
1244 /* fall to next case */
1245 case Cjump:
1246 a = (unsigned char)*code++;
1247 a |= (unsigned char)*code++ << 8;
1248 code += (int)(short)a;
1249 break;
1250 case Cdummy_failure_jump:
1251 case Cfailure_jump:
1252 if (failure_sp == failure_stack_end)
1253 {
1254 if (failure_stack_start != initial_failure_stack)
1255 goto error;
1256 failure_stack_start = (struct failure_point *)
1257 malloc(MAX_FAILURES * sizeof(*failure_stack_start));
Guido van Rossumc65a5251994-08-05 13:44:50 +00001258 if (failure_stack_start == NULL)
1259 {
1260 failure_stack_start = initial_failure_stack;
1261 goto error;
1262 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001263 failure_stack_end = failure_stack_start + MAX_FAILURES;
1264 memcpy((char *)failure_stack_start, (char *)initial_failure_stack,
1265 INITIAL_FAILURES * sizeof(*failure_stack_start));
1266 failure_sp = failure_stack_start + INITIAL_FAILURES;
1267 }
1268 a = (unsigned char)*code++;
1269 a |= (unsigned char)*code++ << 8;
1270 a = (int)(short)a;
1271 if (code[-3] == Cdummy_failure_jump)
1272 { /* this is only used in plus */
1273 assert(*code == Cfailure_jump);
1274 b = (unsigned char)code[1];
1275 b |= (unsigned char)code[2] << 8;
1276 failure_sp->code = code + (int)(short)b + 3;
1277 failure_sp->text = NULL;
1278 code += a;
1279 }
1280 else
1281 {
1282 failure_sp->code = code + a;
1283 failure_sp->text = text;
1284 failure_sp->partend = partend;
1285 }
1286 failure_sp++;
1287 break;
1288 case Cbegbuf:
1289 if (text == string1)
1290 break;
1291 goto fail;
1292 case Cendbuf:
1293 if (size2 == 0 ? text == string1 + size1 : text == string2 + size2)
1294 break;
1295 goto fail;
1296 case Cwordbeg:
1297 if (text == string2 + size2)
1298 goto fail;
1299 if (size2 == 0 && text == string1 + size1)
1300 goto fail;
1301 if (SYNTAX(text == string1 + size1 ? *string1 : *text) != Sword)
1302 goto fail;
1303 if (text == string1)
1304 break;
1305 if (SYNTAX(text[-1]) != Sword)
1306 break;
1307 goto fail;
1308 case Cwordend:
1309 if (text == string1)
1310 goto fail;
1311 if (SYNTAX(text[-1]) != Sword)
1312 goto fail;
1313 if (text == string2 + size2)
1314 break;
1315 if (size2 == 0 && text == string1 + size1)
1316 break;
1317 if (SYNTAX(*text) == Sword)
1318 goto fail;
1319 break;
1320 case Cwordbound:
1321 /* Note: as in gnu regexp, this also matches at the beginning
1322 and end of buffer. */
1323 if (text == string1 || text == string2 + size2 ||
1324 (size2 == 0 && text == string1 + size1))
1325 break;
1326 if ((SYNTAX(text[-1]) == Sword) ^
1327 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword))
1328 break;
1329 goto fail;
1330 case Cnotwordbound:
1331 /* Note: as in gnu regexp, this never matches at the beginning
1332 and end of buffer. */
1333 if (text == string1 || text == string2 + size2 ||
1334 (size2 == 0 && text == string1 + size1))
1335 goto fail;
1336 if (!((SYNTAX(text[-1]) == Sword) ^
1337 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)))
1338 goto fail;
1339 break;
1340 case Csyntaxspec:
1341 NEXTCHAR(ch);
1342 if (SYNTAX(ch) != (unsigned char)*code++)
1343 goto fail;
1344 break;
1345 case Cnotsyntaxspec:
1346 NEXTCHAR(ch);
1347 if (SYNTAX(ch) != (unsigned char)*code++)
1348 break;
1349 goto fail;
1350#ifdef emacs
1351 case Cemacs_at_dot:
1352 if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
1353 goto fail;
1354 break;
1355#endif /* emacs */
1356 default:
1357 abort();
1358 /*NOTREACHED*/
1359 }
1360 }
Guido van Rossum3b1a57a1992-01-27 16:47:46 +00001361#if 0 /* This line is never reached --Guido */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001362 abort();
Guido van Rossum5f21dd11992-01-19 16:49:14 +00001363#endif
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001364 /*NOTREACHED*/
1365
1366 fail:
1367 if (failure_sp != failure_stack_start)
1368 {
1369 failure_sp--;
1370 text = failure_sp->text;
1371 if (text == NULL)
1372 goto fail;
1373 partend = failure_sp->partend;
1374 code = failure_sp->code;
1375 goto continue_matching;
1376 }
1377 if (failure_stack_start != initial_failure_stack)
1378 free((char *)failure_stack_start);
1379 return -1;
1380
1381 error:
1382 if (failure_stack_start != initial_failure_stack)
1383 free((char *)failure_stack_start);
1384 return -2;
1385}
1386
1387#undef PREFETCH
1388#undef NEXTCHAR
1389#undef PUSH_FAILURE
1390
1391int re_match(bufp, string, size, pos, regs)
1392regexp_t bufp;
1393char *string;
1394int size, pos;
1395regexp_registers_t regs;
1396{
1397 return re_match_2(bufp, string, size, (char *)NULL, 0, pos, regs, size);
1398}
1399
1400int re_search_2(bufp, string1, size1, string2, size2, pos, range, regs,
1401 mstop)
1402regexp_t bufp;
1403char *string1, *string2;
1404int size1, size2, pos, range, mstop;
1405regexp_registers_t regs;
1406{
1407 char *fastmap, *translate, *text, *partstart, *partend;
1408 int dir, ret;
1409 char anchor;
1410
1411 assert(size1 >= 0 && size2 >= 0 && pos >= 0 && mstop >= 0);
Guido van Rossum521f81c1992-01-26 18:23:48 +00001412 assert(pos + range >= 0 && pos + range <= size1 + size2); /* Bugfix by ylo */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001413 assert(pos <= mstop);
1414
1415 fastmap = bufp->fastmap;
1416 translate = bufp->translate;
1417 if (fastmap && !bufp->fastmap_accurate)
1418 re_compile_fastmap(bufp);
1419 anchor = bufp->anchor;
1420 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1421 fastmap = NULL;
1422 if (range < 0)
1423 {
1424 dir = -1;
1425 range = -range;
1426 }
1427 else
1428 dir = 1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001429 if (anchor == 2)
1430 if (pos != 0)
1431 return -1;
1432 else
1433 range = 0;
1434 for (; range >= 0; range--, pos += dir)
1435 {
1436 if (fastmap)
1437 {
1438 if (dir == 1)
1439 { /* searching forwards */
1440 if (pos < size1)
1441 {
1442 text = string1 + pos;
1443 if (pos + range > size1)
1444 partend = string1 + size1;
1445 else
1446 partend = string1 + pos + range;
1447 }
1448 else
1449 {
1450 text = string2 + pos - size1;
1451 partend = string2 + pos + range - size1;
1452 }
1453 partstart = text;
1454 if (translate)
1455 while (text != partend &&
1456 !fastmap[(unsigned char)
1457 translate[(unsigned char)*text]])
1458 text++;
1459 else
1460 while (text != partend && !fastmap[(unsigned char)*text])
1461 text++;
1462 pos += text - partstart;
1463 range -= text - partstart;
1464 if (pos == size1 + size2 && bufp->can_be_null == 0)
1465 return -1;
1466 }
1467 else
1468 { /* searching backwards */
1469 if (pos <= size1)
1470 {
1471 text = string1 + pos;
1472 partstart = string1 + pos - range;
1473 }
1474 else
1475 {
1476 text = string2 + pos - size1;
1477 if (range < pos - size1)
1478 partstart = string2 + pos - size1 - range;
1479 else
1480 partstart = string2;
1481 }
1482 partend = text;
1483 if (translate)
1484 while (text != partstart &&
1485 !fastmap[(unsigned char)
1486 translate[(unsigned char)*text]])
1487 text--;
1488 else
1489 while (text != partstart &&
1490 !fastmap[(unsigned char)*text])
1491 text--;
1492 pos -= partend - text;
1493 range -= partend - text;
1494 }
1495 }
1496 if (anchor == 1)
1497 { /* anchored to begline */
1498 if (pos > 0 &&
1499 (pos <= size1 ? string1[pos - 1] :
1500 string2[pos - size1 - 1]) != '\n')
1501 continue;
1502 }
1503 assert(pos >= 0 && pos <= size1 + size2);
1504 ret = re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop);
1505 if (ret >= 0)
1506 return pos;
1507 if (ret == -2)
1508 return -2;
1509 }
1510 return -1;
1511}
1512
1513int re_search(bufp, string, size, startpos, range, regs)
1514regexp_t bufp;
1515char *string;
1516int size, startpos, range;
1517regexp_registers_t regs;
1518{
1519 return re_search_2(bufp, string, size, (char *)NULL, 0,
1520 startpos, range, regs, size);
1521}
1522
Guido van Rossum9abc5391992-03-27 17:24:37 +00001523#ifdef UNUSED
1524
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001525static struct re_pattern_buffer re_comp_buf;
1526
1527char *re_comp(s)
1528char *s;
1529{
1530 if (s == NULL)
1531 {
1532 if (!re_comp_buf.buffer)
1533 return "Out of memory";
1534 return NULL;
1535 }
1536 if (!re_comp_buf.buffer)
1537 {
1538 /* the buffer will be allocated automatically */
1539 re_comp_buf.fastmap = malloc(256);
1540 re_comp_buf.translate = NULL;
Guido van Rossumc65a5251994-08-05 13:44:50 +00001541 if (re_comp_buf.fastmap == NULL)
1542 return "Out of memory";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001543 }
1544 return re_compile_pattern(s, strlen(s), &re_comp_buf);
1545}
1546
1547int re_exec(s)
1548char *s;
1549{
1550 int len = strlen(s);
1551
1552 return re_search(&re_comp_buf, s, len, 0, len, (regexp_registers_t)NULL) >= 0;
1553}
1554
Guido van Rossum9abc5391992-03-27 17:24:37 +00001555#endif
1556
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001557#ifdef TEST_REGEXP
1558
1559int main()
1560{
1561 char buf[500];
1562 char *cp;
1563 struct re_pattern_buffer exp;
1564 struct re_registers regs;
1565 int a,pos;
1566 char fastmap[256];
1567
1568 exp.allocated = 0;
1569 exp.buffer = 0;
1570 exp.translate = NULL;
1571 exp.fastmap = fastmap;
1572
1573 /* re_set_syntax(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_ANSI_HEX); */
1574
1575 while (1)
1576 {
1577 printf("Enter regexp:\n");
1578 gets(buf);
1579 cp=re_compile_pattern(buf, strlen(buf), &exp);
1580 if (cp)
1581 {
1582 printf("Error: %s\n", cp);
1583 continue;
1584 }
1585 re_compile_fastmap(&exp);
1586 printf("dump:\n");
1587 for (pos = 0; pos < exp.used;)
1588 {
1589 printf("%d: ", pos);
1590 switch (exp.buffer[pos++])
1591 {
1592 case Cend:
1593 strcpy(buf, "end");
1594 break;
1595 case Cbol:
1596 strcpy(buf, "bol");
1597 break;
1598 case Ceol:
1599 strcpy(buf, "eol");
1600 break;
1601 case Cset:
1602 strcpy(buf, "set ");
1603 for (a = 0; a < 256/8; a++)
1604 sprintf(buf+strlen(buf)," %02x",
1605 (unsigned char)exp.buffer[pos++]);
1606 break;
1607 case Cexact:
1608 sprintf(buf, "exact '%c' 0x%x", exp.buffer[pos],
1609 (unsigned char)exp.buffer[pos]);
1610 pos++;
1611 break;
1612 case Canychar:
1613 strcpy(buf, "anychar");
1614 break;
1615 case Cstart_memory:
1616 sprintf(buf, "start_memory %d", exp.buffer[pos++]);
1617 break;
1618 case Cend_memory:
1619 sprintf(buf, "end_memory %d", exp.buffer[pos++]);
1620 break;
1621 case Cmatch_memory:
1622 sprintf(buf, "match_memory %d", exp.buffer[pos++]);
1623 break;
1624 case Cjump:
1625 case Cdummy_failure_jump:
1626 case Cstar_jump:
1627 case Cfailure_jump:
1628 case Cupdate_failure_jump:
1629 a = (unsigned char)exp.buffer[pos++];
1630 a += (unsigned char)exp.buffer[pos++] << 8;
1631 a = (int)(short)a;
1632 switch (exp.buffer[pos-3])
1633 {
1634 case Cjump:
1635 cp = "jump";
1636 break;
1637 case Cstar_jump:
1638 cp = "star_jump";
1639 break;
1640 case Cfailure_jump:
1641 cp = "failure_jump";
1642 break;
1643 case Cupdate_failure_jump:
1644 cp = "update_failure_jump";
1645 break;
1646 case Cdummy_failure_jump:
1647 cp = "dummy_failure_jump";
1648 break;
1649 default:
1650 cp = "unknown jump";
1651 break;
1652 }
1653 sprintf(buf, "%s %d", cp, a + pos);
1654 break;
1655 case Cbegbuf:
1656 strcpy(buf,"begbuf");
1657 break;
1658 case Cendbuf:
1659 strcpy(buf,"endbuf");
1660 break;
1661 case Cwordbeg:
1662 strcpy(buf,"wordbeg");
1663 break;
1664 case Cwordend:
1665 strcpy(buf,"wordend");
1666 break;
1667 case Cwordbound:
1668 strcpy(buf,"wordbound");
1669 break;
1670 case Cnotwordbound:
1671 strcpy(buf,"notwordbound");
1672 break;
1673 default:
1674 sprintf(buf, "unknown code %d",
1675 (unsigned char)exp.buffer[pos - 1]);
1676 break;
1677 }
1678 printf("%s\n", buf);
1679 }
1680 printf("can_be_null = %d uses_registers = %d anchor = %d\n",
1681 exp.can_be_null, exp.uses_registers, exp.anchor);
1682
1683 printf("fastmap:");
1684 for (a = 0; a < 256; a++)
1685 if (exp.fastmap[a])
1686 printf(" %d", a);
1687 printf("\n");
1688 printf("Enter strings. An empty line terminates.\n");
1689 while (fgets(buf, sizeof(buf), stdin))
1690 {
1691 if (buf[0] == '\n')
1692 break;
1693 a = re_search(&exp, buf, strlen(buf), 0, strlen(buf), &regs);
1694 printf("search returns %d\n", a);
1695 if (a != -1)
1696 {
1697 for (a = 0; a < RE_NREGS; a++)
1698 {
1699 printf("buf %d: %d to %d\n", a, regs.start[a], regs.end[a]);
1700 }
1701 }
1702 }
1703 }
1704}
1705
1706#endif /* TEST_REGEXP */