blob: 841f500d095b100911cb6b42b170f212308f0cc8 [file] [log] [blame]
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001/*
2
3regexpr.c
4
5Author: Tatu Ylonen <ylo@ngs.fi>
6
7Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
8
9Permission to use, copy, modify, distribute, and sell this software
10and its documentation for any purpose is hereby granted without fee,
11provided that the above copyright notice appear in all copies. This
12software is provided "as is" without express or implied warranty.
13
14Created: Thu Sep 26 17:14:05 1991 ylo
15Last modified: Mon Nov 4 17:06:48 1991 ylo
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000016Ported to Think C: 19 Jan 1992 guido@cwi.nl
Guido van Rossumb674c3b1992-01-19 16:32:47 +000017
18This code draws many ideas from the regular expression packages by
19Henry Spencer of the University of Toronto and Richard Stallman of the
20Free Software Foundation.
21
22Emacs-specific code and syntax table code is almost directly borrowed
23from GNU regexp.
24
Guido van Rossumb674c3b1992-01-19 16:32:47 +000025*/
26
Guido van Rossumb6775db1994-08-01 11:34:53 +000027#include "myproto.h" /* For PROTO macro --Guido */
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000028
Guido van Rossumb674c3b1992-01-19 16:32:47 +000029#include <stdio.h>
30#include <assert.h>
31#include "regexpr.h"
32
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000033#ifdef THINK_C
34/* Think C on the Mac really needs these headers... --Guido */
35#include <stdlib.h>
36#include <string.h>
37#else
Guido van Rossum9abc5391992-03-27 17:24:37 +000038#ifdef __STDC__
39/* Don't mess around, use the standard headers */
40#include <stdlib.h>
41#include <string.h>
42#else
Guido van Rossumb674c3b1992-01-19 16:32:47 +000043char *malloc();
44void free();
45char *realloc();
Guido van Rossum9abc5391992-03-27 17:24:37 +000046#endif /* __STDC__ */
47#endif /* THINK_C */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000048
49#define MACRO_BEGIN do {
50#define MACRO_END } while (0)
51
52enum regexp_compiled_ops /* opcodes for compiled regexp */
53{
54 Cend, /* end of pattern reached */
55 Cbol, /* beginning of line */
56 Ceol, /* end of line */
57 Cset, /* character set. Followed by 32 bytes of set. */
58 Cexact, /* followed by a byte to match */
59 Canychar, /* matches any character except newline */
60 Cstart_memory, /* set register start addr (followed by reg number) */
61 Cend_memory, /* set register end addr (followed by reg number) */
62 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
63 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
64 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
65 Cfailure_jump, /* jump to addr on failure */
66 Cupdate_failure_jump, /* update topmost failure point and jump */
67 Cdummy_failure_jump, /* push a dummy failure point and jump */
68 Cbegbuf, /* match at beginning of buffer */
69 Cendbuf, /* match at end of buffer */
70 Cwordbeg, /* match at beginning of word */
71 Cwordend, /* match at end of word */
72 Cwordbound, /* match if at word boundary */
73 Cnotwordbound, /* match if not at word boundary */
74#ifdef emacs
75 Cemacs_at_dot, /* emacs only: matches at dot */
76#endif /* emacs */
77 Csyntaxspec, /* matches syntax code (1 byte follows) */
78 Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/
79};
80
81enum regexp_syntax_op /* syntax codes for plain and quoted characters */
82{
83 Rend, /* special code for end of regexp */
84 Rnormal, /* normal character */
85 Ranychar, /* any character except newline */
86 Rquote, /* the quote character */
87 Rbol, /* match beginning of line */
88 Reol, /* match end of line */
89 Roptional, /* match preceding expression optionally */
90 Rstar, /* match preceding expr zero or more times */
91 Rplus, /* match preceding expr one or more times */
92 Ror, /* match either of alternatives */
93 Ropenpar, /* opening parenthesis */
94 Rclosepar, /* closing parenthesis */
95 Rmemory, /* match memory register */
96 Rextended_memory, /* \vnn to match registers 10-99 */
97 Ropenset, /* open set. Internal syntax hard-coded below. */
98 /* the following are gnu extensions to "normal" regexp syntax */
99 Rbegbuf, /* beginning of buffer */
100 Rendbuf, /* end of buffer */
101 Rwordchar, /* word character */
102 Rnotwordchar, /* not word character */
103 Rwordbeg, /* beginning of word */
104 Rwordend, /* end of word */
105 Rwordbound, /* word bound */
106 Rnotwordbound, /* not word bound */
107#ifdef emacs
108 Remacs_at_dot, /* emacs: at dot */
109 Remacs_syntaxspec, /* syntaxspec */
110 Remacs_notsyntaxspec, /* notsyntaxspec */
111#endif /* emacs */
112 Rnum_ops
113};
114
115static int re_compile_initialized = 0;
116static int regexp_syntax = 0;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000117int re_syntax = 0; /* Exported copy of regexp_syntax */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000118static unsigned char regexp_plain_ops[256];
119static unsigned char regexp_quoted_ops[256];
120static unsigned char regexp_precedences[Rnum_ops];
121static int regexp_context_indep_ops;
122static int regexp_ansi_sequences;
123
124#define NUM_LEVELS 5 /* number of precedence levels in use */
125#define MAX_NESTING 100 /* max nesting level of operators */
126
127#ifdef emacs
128
129/* This code is for emacs compatibility only. */
130
131#include "config.h"
132#include "lisp.h"
133#include "buffer.h"
134#include "syntax.h"
135
136/* emacs defines NULL in some strange way? */
137#undef NULL
138#define NULL 0
139
140#else /* emacs */
141
142#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
143#define Sword 1
144
145#ifdef SYNTAX_TABLE
146char *re_syntax_table;
147#else
148static char re_syntax_table[256];
149#endif /* SYNTAX_TABLE */
150
151#endif /* emacs */
152
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000153static void re_compile_initialize PROTO((void));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000154static void re_compile_initialize()
155{
156 int a;
157
158#if !defined(emacs) && !defined(SYNTAX_TABLE)
159 static int syntax_table_inited = 0;
160
161 if (!syntax_table_inited)
162 {
163 syntax_table_inited = 1;
164 memset(re_syntax_table, 0, 256);
165 for (a = 'a'; a <= 'z'; a++)
166 re_syntax_table[a] = Sword;
167 for (a = 'A'; a <= 'Z'; a++)
168 re_syntax_table[a] = Sword;
169 for (a = '0'; a <= '9'; a++)
170 re_syntax_table[a] = Sword;
171 }
172#endif /* !emacs && !SYNTAX_TABLE */
173 re_compile_initialized = 1;
174 for (a = 0; a < 256; a++)
175 {
176 regexp_plain_ops[a] = Rnormal;
177 regexp_quoted_ops[a] = Rnormal;
178 }
179 for (a = '0'; a <= '9'; a++)
180 regexp_quoted_ops[a] = Rmemory;
181 regexp_plain_ops['\134'] = Rquote;
182 if (regexp_syntax & RE_NO_BK_PARENS)
183 {
184 regexp_plain_ops['('] = Ropenpar;
185 regexp_plain_ops[')'] = Rclosepar;
186 }
187 else
188 {
189 regexp_quoted_ops['('] = Ropenpar;
190 regexp_quoted_ops[')'] = Rclosepar;
191 }
192 if (regexp_syntax & RE_NO_BK_VBAR)
193 regexp_plain_ops['\174'] = Ror;
194 else
195 regexp_quoted_ops['\174'] = Ror;
196 regexp_plain_ops['*'] = Rstar;
197 if (regexp_syntax & RE_BK_PLUS_QM)
198 {
199 regexp_quoted_ops['+'] = Rplus;
200 regexp_quoted_ops['?'] = Roptional;
201 }
202 else
203 {
204 regexp_plain_ops['+'] = Rplus;
205 regexp_plain_ops['?'] = Roptional;
206 }
207 if (regexp_syntax & RE_NEWLINE_OR)
208 regexp_plain_ops['\n'] = Ror;
209 regexp_plain_ops['\133'] = Ropenset;
210 regexp_plain_ops['\136'] = Rbol;
211 regexp_plain_ops['$'] = Reol;
212 regexp_plain_ops['.'] = Ranychar;
213 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
214 {
215#ifdef emacs
216 regexp_quoted_ops['='] = Remacs_at_dot;
217 regexp_quoted_ops['s'] = Remacs_syntaxspec;
218 regexp_quoted_ops['S'] = Remacs_notsyntaxspec;
219#endif /* emacs */
220 regexp_quoted_ops['w'] = Rwordchar;
221 regexp_quoted_ops['W'] = Rnotwordchar;
222 regexp_quoted_ops['<'] = Rwordbeg;
223 regexp_quoted_ops['>'] = Rwordend;
224 regexp_quoted_ops['b'] = Rwordbound;
225 regexp_quoted_ops['B'] = Rnotwordbound;
226 regexp_quoted_ops['`'] = Rbegbuf;
227 regexp_quoted_ops['\''] = Rendbuf;
228 }
229 if (regexp_syntax & RE_ANSI_HEX)
230 regexp_quoted_ops['v'] = Rextended_memory;
231 for (a = 0; a < Rnum_ops; a++)
232 regexp_precedences[a] = 4;
233 if (regexp_syntax & RE_TIGHT_VBAR)
234 {
235 regexp_precedences[Ror] = 3;
236 regexp_precedences[Rbol] = 2;
237 regexp_precedences[Reol] = 2;
238 }
239 else
240 {
241 regexp_precedences[Ror] = 2;
242 regexp_precedences[Rbol] = 3;
243 regexp_precedences[Reol] = 3;
244 }
245 regexp_precedences[Rclosepar] = 1;
246 regexp_precedences[Rend] = 0;
247 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
248 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
249}
250
251int re_set_syntax(syntax)
252int syntax;
253{
254 int ret;
255
256 ret = regexp_syntax;
257 regexp_syntax = syntax;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000258 re_syntax = syntax; /* Exported copy */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000259 re_compile_initialize();
260 return ret;
261}
262
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000263static int hex_char_to_decimal PROTO((int));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000264static int hex_char_to_decimal(ch)
265int ch;
266{
267 if (ch >= '0' && ch <= '9')
268 return ch - '0';
269 if (ch >= 'a' && ch <= 'f')
270 return ch - 'a' + 10;
271 if (ch >= 'A' && ch <= 'F')
272 return ch - 'A' + 10;
273 return 16;
274}
275
276char *re_compile_pattern(regex, size, bufp)
277char *regex;
278int size;
279regexp_t bufp;
280{
281 int a, pos, op, current_level, level, opcode;
282 int pattern_offset, alloc;
283 int starts[NUM_LEVELS * MAX_NESTING], starts_base;
284 int future_jumps[MAX_NESTING], num_jumps;
285 unsigned char ch;
286 char *pattern, *translate;
287 int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS];
288 int beginning_context;
289
290#define NEXTCHAR(var) \
291 MACRO_BEGIN \
292 if (pos >= size) \
293 goto ends_prematurely; \
294 (var) = regex[pos]; \
295 pos++; \
296 MACRO_END
297
298#define ALLOC(amount) \
299 MACRO_BEGIN \
300 if (pattern_offset+(amount) > alloc) \
301 { \
302 alloc += 256 + (amount); \
303 pattern = realloc(pattern, alloc); \
304 if (!pattern) \
305 goto out_of_memory; \
306 } \
307 MACRO_END
308
309#define STORE(ch) pattern[pattern_offset++] = (ch)
310
311#define CURRENT_LEVEL_START (starts[starts_base + current_level])
312
313#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
314
315#define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
316 starts_base += NUM_LEVELS; \
317 else \
318 goto too_complex
319
320#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
321
322#define PUT_ADDR(offset,addr) \
323 MACRO_BEGIN \
324 int disp = (addr) - (offset) - 2; \
325 pattern[(offset)] = disp & 0xff; \
326 pattern[(offset)+1] = (disp>>8) & 0xff; \
327 MACRO_END
328
329#define INSERT_JUMP(pos,type,addr) \
330 MACRO_BEGIN \
331 int a, p = (pos), t = (type), ad = (addr); \
332 for (a = pattern_offset - 1; a >= p; a--) \
333 pattern[a + 3] = pattern[a]; \
334 pattern[p] = t; \
335 PUT_ADDR(p+1,ad); \
336 pattern_offset += 3; \
337 MACRO_END
338
339#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
340
341#define SET_FIELDS \
342 MACRO_BEGIN \
343 bufp->allocated = alloc; \
344 bufp->buffer = pattern; \
345 bufp->used = pattern_offset; \
346 MACRO_END
347
348#define GETHEX(var) \
349 MACRO_BEGIN \
350 char gethex_ch, gethex_value; \
351 NEXTCHAR(gethex_ch); \
352 gethex_value = hex_char_to_decimal(gethex_ch); \
353 if (gethex_value == 16) \
354 goto hex_error; \
355 NEXTCHAR(gethex_ch); \
356 gethex_ch = hex_char_to_decimal(gethex_ch); \
357 if (gethex_ch == 16) \
358 goto hex_error; \
359 (var) = gethex_value * 16 + gethex_ch; \
360 MACRO_END
361
362#define ANSI_TRANSLATE(ch) \
363 MACRO_BEGIN \
364 switch (ch) \
365 { \
366 case 'a': \
367 case 'A': \
368 ch = 7; /* audible bell */ \
369 break; \
370 case 'b': \
371 case 'B': \
372 ch = 8; /* backspace */ \
373 break; \
374 case 'f': \
375 case 'F': \
376 ch = 12; /* form feed */ \
377 break; \
378 case 'n': \
379 case 'N': \
380 ch = 10; /* line feed */ \
381 break; \
382 case 'r': \
383 case 'R': \
384 ch = 13; /* carriage return */ \
385 break; \
386 case 't': \
387 case 'T': \
388 ch = 9; /* tab */ \
389 break; \
390 case 'v': \
391 case 'V': \
392 ch = 11; /* vertical tab */ \
393 break; \
394 case 'x': /* hex code */ \
395 case 'X': \
396 GETHEX(ch); \
397 break; \
398 default: \
399 /* other characters passed through */ \
400 if (translate) \
401 ch = translate[(unsigned char)ch]; \
402 break; \
403 } \
404 MACRO_END
405
406 if (!re_compile_initialized)
407 re_compile_initialize();
408 bufp->used = 0;
409 bufp->fastmap_accurate = 0;
410 bufp->uses_registers = 0;
411 translate = bufp->translate;
412 pattern = bufp->buffer;
413 alloc = bufp->allocated;
414 if (alloc == 0 || pattern == NULL)
415 {
416 alloc = 256;
417 pattern = malloc(alloc);
418 if (!pattern)
419 goto out_of_memory;
420 }
421 pattern_offset = 0;
422 starts_base = 0;
423 num_jumps = 0;
424 current_level = 0;
425 SET_LEVEL_START;
426 num_open_registers = 0;
427 next_register = 1;
428 paren_depth = 0;
429 beginning_context = 1;
430 op = -1;
431 /* we use Rend dummy to ensure that pending jumps are updated (due to
432 low priority of Rend) before exiting the loop. */
433 pos = 0;
434 while (op != Rend)
435 {
436 if (pos >= size)
437 op = Rend;
438 else
439 {
440 NEXTCHAR(ch);
441 if (translate)
442 ch = translate[(unsigned char)ch];
443 op = regexp_plain_ops[(unsigned char)ch];
444 if (op == Rquote)
445 {
446 NEXTCHAR(ch);
447 op = regexp_quoted_ops[(unsigned char)ch];
448 if (op == Rnormal && regexp_ansi_sequences)
449 ANSI_TRANSLATE(ch);
450 }
451 }
452 level = regexp_precedences[op];
453 /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
454 ch, op, level, current_level, CURRENT_LEVEL_START); */
455 if (level > current_level)
456 {
457 for (current_level++; current_level < level; current_level++)
458 SET_LEVEL_START;
459 SET_LEVEL_START;
460 }
461 else
462 if (level < current_level)
463 {
464 current_level = level;
465 for (;num_jumps > 0 &&
466 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
467 num_jumps--)
468 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
469 }
470 switch (op)
471 {
472 case Rend:
473 break;
474 case Rnormal:
475 normal_char:
476 opcode = Cexact;
477 store_opcode_and_arg: /* opcode & ch must be set */
478 SET_LEVEL_START;
479 ALLOC(2);
480 STORE(opcode);
481 STORE(ch);
482 break;
483 case Ranychar:
484 opcode = Canychar;
485 store_opcode:
486 SET_LEVEL_START;
487 ALLOC(1);
488 STORE(opcode);
489 break;
490 case Rquote:
491 abort();
492 /*NOTREACHED*/
493 case Rbol:
494 if (!beginning_context)
495 if (regexp_context_indep_ops)
496 goto op_error;
497 else
498 goto normal_char;
499 opcode = Cbol;
500 goto store_opcode;
501 case Reol:
502 if (!((pos >= size) ||
503 ((regexp_syntax & RE_NO_BK_VBAR) ?
504 (regex[pos] == '\174') :
505 (pos+1 < size && regex[pos] == '\134' &&
506 regex[pos+1] == '\174')) ||
507 ((regexp_syntax & RE_NO_BK_PARENS)?
508 (regex[pos] == ')'):
509 (pos+1 < size && regex[pos] == '\134' &&
510 regex[pos+1] == ')'))))
511 if (regexp_context_indep_ops)
512 goto op_error;
513 else
514 goto normal_char;
515 opcode = Ceol;
516 goto store_opcode;
Guido van Rossum9abc5391992-03-27 17:24:37 +0000517 /* NOTREACHED */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000518 break;
519 case Roptional:
520 if (beginning_context)
521 if (regexp_context_indep_ops)
522 goto op_error;
523 else
524 goto normal_char;
525 if (CURRENT_LEVEL_START == pattern_offset)
526 break; /* ignore empty patterns for ? */
527 ALLOC(3);
528 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
529 pattern_offset + 3);
530 break;
531 case Rstar:
532 case Rplus:
533 if (beginning_context)
534 if (regexp_context_indep_ops)
535 goto op_error;
536 else
537 goto normal_char;
538 if (CURRENT_LEVEL_START == pattern_offset)
539 break; /* ignore empty patterns for + and * */
540 ALLOC(9);
541 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
542 pattern_offset + 6);
543 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
544 if (op == Rplus) /* jump over initial failure_jump */
545 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
546 CURRENT_LEVEL_START + 6);
547 break;
548 case Ror:
549 ALLOC(6);
550 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
551 pattern_offset + 6);
552 if (num_jumps >= MAX_NESTING)
553 goto too_complex;
554 STORE(Cjump);
555 future_jumps[num_jumps++] = pattern_offset;
556 STORE(0);
557 STORE(0);
558 SET_LEVEL_START;
559 break;
560 case Ropenpar:
561 SET_LEVEL_START;
562 if (next_register < RE_NREGS)
563 {
564 bufp->uses_registers = 1;
565 ALLOC(2);
566 STORE(Cstart_memory);
567 STORE(next_register);
568 open_registers[num_open_registers++] = next_register;
569 next_register++;
570 }
571 paren_depth++;
572 PUSH_LEVEL_STARTS;
573 current_level = 0;
574 SET_LEVEL_START;
575 break;
576 case Rclosepar:
577 if (paren_depth <= 0)
578 goto parenthesis_error;
579 POP_LEVEL_STARTS;
580 current_level = regexp_precedences[Ropenpar];
581 paren_depth--;
582 if (paren_depth < num_open_registers)
583 {
584 bufp->uses_registers = 1;
585 ALLOC(2);
586 STORE(Cend_memory);
587 num_open_registers--;
588 STORE(open_registers[num_open_registers]);
589 }
590 break;
591 case Rmemory:
592 if (ch == '0')
593 goto bad_match_register;
594 assert(ch >= '0' && ch <= '9');
595 bufp->uses_registers = 1;
596 opcode = Cmatch_memory;
597 ch -= '0';
598 goto store_opcode_and_arg;
599 case Rextended_memory:
600 NEXTCHAR(ch);
601 if (ch < '0' || ch > '9')
602 goto bad_match_register;
603 NEXTCHAR(a);
604 if (a < '0' || a > '9')
605 goto bad_match_register;
606 ch = 10 * (a - '0') + ch - '0';
607 if (ch <= 0 || ch >= RE_NREGS)
608 goto bad_match_register;
609 bufp->uses_registers = 1;
610 opcode = Cmatch_memory;
611 goto store_opcode_and_arg;
612 case Ropenset:
613 {
614 int complement,prev,offset,range,firstchar;
615
616 SET_LEVEL_START;
617 ALLOC(1+256/8);
618 STORE(Cset);
619 offset = pattern_offset;
620 for (a = 0; a < 256/8; a++)
621 STORE(0);
622 NEXTCHAR(ch);
623 if (translate)
624 ch = translate[(unsigned char)ch];
625 if (ch == '\136')
626 {
627 complement = 1;
628 NEXTCHAR(ch);
629 if (translate)
630 ch = translate[(unsigned char)ch];
631 }
632 else
633 complement = 0;
634 prev = -1;
635 range = 0;
636 firstchar = 1;
637 while (ch != '\135' || firstchar)
638 {
639 firstchar = 0;
640 if (regexp_ansi_sequences && ch == '\134')
641 {
642 NEXTCHAR(ch);
643 ANSI_TRANSLATE(ch);
644 }
645 if (range)
646 {
Guido van Rossumb6775db1994-08-01 11:34:53 +0000647 for (a = prev; a <= (int)ch; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000648 SETBIT(pattern, offset, a);
649 prev = -1;
650 range = 0;
651 }
652 else
653 if (prev != -1 && ch == '-')
654 range = 1;
655 else
656 {
657 SETBIT(pattern, offset, ch);
658 prev = ch;
659 }
660 NEXTCHAR(ch);
661 if (translate)
662 ch = translate[(unsigned char)ch];
663 }
664 if (range)
665 SETBIT(pattern, offset, '-');
666 if (complement)
667 {
668 for (a = 0; a < 256/8; a++)
669 pattern[offset+a] ^= 0xff;
670 }
671 break;
672 }
673 case Rbegbuf:
674 opcode = Cbegbuf;
675 goto store_opcode;
676 case Rendbuf:
677 opcode = Cendbuf;
678 goto store_opcode;
679 case Rwordchar:
680 opcode = Csyntaxspec;
681 ch = Sword;
682 goto store_opcode_and_arg;
683 case Rnotwordchar:
684 opcode = Cnotsyntaxspec;
685 ch = Sword;
686 goto store_opcode_and_arg;
687 case Rwordbeg:
688 opcode = Cwordbeg;
689 goto store_opcode;
690 case Rwordend:
691 opcode = Cwordend;
692 goto store_opcode;
693 case Rwordbound:
694 opcode = Cwordbound;
695 goto store_opcode;
696 case Rnotwordbound:
697 opcode = Cnotwordbound;
698 goto store_opcode;
699#ifdef emacs
700 case Remacs_at_dot:
701 opcode = Cemacs_at_dot;
702 goto store_opcode;
703 case Remacs_syntaxspec:
704 NEXTCHAR(ch);
705 if (translate)
706 ch = translate[(unsigned char)ch];
707 opcode = Csyntaxspec;
708 ch = syntax_spec_code[(unsigned char)ch];
709 goto store_opcode_and_arg;
710 case Remacs_notsyntaxspec:
711 NEXTCHAR(ch);
712 if (translate)
713 ch = translate[(unsigned char)ch];
714 opcode = Cnotsyntaxspec;
715 ch = syntax_spec_code[(unsigned char)ch];
716 goto store_opcode_and_arg;
717#endif /* emacs */
718 default:
719 abort();
720 }
721 beginning_context = (op == Ropenpar || op == Ror);
722 }
723 if (starts_base != 0)
724 goto parenthesis_error;
725 assert(num_jumps == 0);
726 ALLOC(1);
727 STORE(Cend);
728 SET_FIELDS;
729 return NULL;
730
731 op_error:
732 SET_FIELDS;
733 return "Badly placed special character";
734
735 bad_match_register:
736 SET_FIELDS;
737 return "Bad match register number";
738
739 hex_error:
740 SET_FIELDS;
741 return "Bad hexadecimal number";
742
743 parenthesis_error:
744 SET_FIELDS;
745 return "Badly placed parenthesis";
746
747 out_of_memory:
748 SET_FIELDS;
749 return "Out of memory";
750
751 ends_prematurely:
752 SET_FIELDS;
753 return "Regular expression ends prematurely";
754
755 too_complex:
756 SET_FIELDS;
757 return "Regular expression too complex";
758}
759#undef CHARAT
760#undef NEXTCHAR
761#undef GETHEX
762#undef ALLOC
763#undef STORE
764#undef CURRENT_LEVEL_START
765#undef SET_LEVEL_START
766#undef PUSH_LEVEL_STARTS
767#undef POP_LEVEL_STARTS
768#undef PUT_ADDR
769#undef INSERT_JUMP
770#undef SETBIT
771#undef SET_FIELDS
772
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000773static void re_compile_fastmap_aux
774 PROTO((char *, int, char *, char *, char *));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000775static void re_compile_fastmap_aux(code, pos, visited, can_be_null, fastmap)
776char *code, *visited, *can_be_null, *fastmap;
777int pos;
778{
779 int a, b, syntaxcode;
780
781 if (visited[pos])
782 return; /* we have already been here */
783 visited[pos] = 1;
784 for (;;)
785 switch (code[pos++])
786 {
787 case Cend:
788 *can_be_null = 1;
789 return;
790 case Cbol:
791 case Cbegbuf:
792 case Cendbuf:
793 case Cwordbeg:
794 case Cwordend:
795 case Cwordbound:
796 case Cnotwordbound:
797#ifdef emacs
798 case Cemacs_at_dot:
799#endif /* emacs */
800 break;
801 case Csyntaxspec:
802 syntaxcode = code[pos++];
803 for (a = 0; a < 256; a++)
804 if (SYNTAX(a) == syntaxcode)
805 fastmap[a] = 1;
806 return;
807 case Cnotsyntaxspec:
808 syntaxcode = code[pos++];
809 for (a = 0; a < 256; a++)
810 if (SYNTAX(a) != syntaxcode)
811 fastmap[a] = 1;
812 return;
813 case Ceol:
814 fastmap['\n'] = 1;
815 if (*can_be_null == 0)
816 *can_be_null = 2; /* can match null, but only at end of buffer*/
817 return;
818 case Cset:
819 for (a = 0; a < 256/8; a++)
820 if (code[pos + a] != 0)
821 for (b = 0; b < 8; b++)
822 if (code[pos + a] & (1 << b))
823 fastmap[(a << 3) + b] = 1;
824 pos += 256/8;
825 return;
826 case Cexact:
827 fastmap[(unsigned char)code[pos]] = 1;
828 return;
829 case Canychar:
830 for (a = 0; a < 256; a++)
831 if (a != '\n')
832 fastmap[a] = 1;
833 return;
834 case Cstart_memory:
835 case Cend_memory:
836 pos++;
837 break;
838 case Cmatch_memory:
839 /* should this ever happen for sensible patterns??? */
840 *can_be_null = 1;
841 return;
842 case Cjump:
843 case Cdummy_failure_jump:
844 case Cupdate_failure_jump:
845 case Cstar_jump:
846 a = (unsigned char)code[pos++];
847 a |= (unsigned char)code[pos++] << 8;
848 pos += (int)(short)a;
849 if (visited[pos])
850 {
851 /* argh... the regexp contains empty loops. This is not
852 good, as this may cause a failure stack overflow when
853 matching. Oh well. */
854 /* this path leads nowhere; pursue other paths. */
855 return;
856 }
857 visited[pos] = 1;
858 break;
859 case Cfailure_jump:
860 a = (unsigned char)code[pos++];
861 a |= (unsigned char)code[pos++] << 8;
862 a = pos + (int)(short)a;
863 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
864 break;
865 default:
866 abort(); /* probably some opcode is missing from this switch */
867 /*NOTREACHED*/
868 }
869}
870
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000871static int re_do_compile_fastmap PROTO((char *, int, int, char *, char *));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000872static int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
873char *buffer, *fastmap, *can_be_null;
874int used, pos;
875{
876 char small_visited[512], *visited;
877
878 if (used <= sizeof(small_visited))
879 visited = small_visited;
880 else
881 {
882 visited = malloc(used);
883 if (!visited)
884 return 0;
885 }
886 *can_be_null = 0;
887 memset(fastmap, 0, 256);
888 memset(visited, 0, used);
889 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
890 if (visited != small_visited)
891 free(visited);
892 return 1;
893}
894
895void re_compile_fastmap(bufp)
896regexp_t bufp;
897{
898 if (!bufp->fastmap || bufp->fastmap_accurate)
899 return;
900 assert(bufp->used > 0);
901 if (!re_do_compile_fastmap(bufp->buffer, bufp->used, 0, &bufp->can_be_null,
902 bufp->fastmap))
903 return;
904 if (bufp->buffer[0] == Cbol)
905 bufp->anchor = 1; /* begline */
906 else
907 if (bufp->buffer[0] == Cbegbuf)
908 bufp->anchor = 2; /* begbuf */
909 else
910 bufp->anchor = 0; /* none */
911 bufp->fastmap_accurate = 1;
912}
913
914#define INITIAL_FAILURES 128 /* initial # failure points to allocate */
915#define MAX_FAILURES 4100 /* max # of failure points before failing */
916
917int re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop)
918regexp_t bufp;
919char *string1, *string2;
920int size1, size2, pos, mstop;
921regexp_registers_t regs;
922{
923 struct failure_point { char *text, *partend, *code; }
924 *failure_stack_start, *failure_sp, *failure_stack_end,
925 initial_failure_stack[INITIAL_FAILURES];
926 char *code, *translate, *text, *textend, *partend, *part_2_end;
927 char *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
928 char *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
929 int a, b, ch, reg, regch, match_end;
930 char *regtext, *regpartend, *regtextend;
931
932#define PREFETCH \
933 MACRO_BEGIN \
934 if (text == partend) \
935 { \
936 if (text == textend) \
937 goto fail; \
938 text = string2; \
939 partend = part_2_end; \
940 } \
941 MACRO_END
942
943#define NEXTCHAR(var) \
944 MACRO_BEGIN \
945 PREFETCH; \
946 (var) = (unsigned char)*text++; \
947 if (translate) \
948 (var) = (unsigned char)translate[(var)]; \
949 MACRO_END
950
951 assert(pos >= 0 && size1 >= 0 && size2 >= 0 && mstop >= 0);
952 assert(mstop <= size1 + size2);
953 assert(pos <= mstop);
954
955 if (pos <= size1)
956 {
957 text = string1 + pos;
958 if (mstop <= size1)
959 {
960 partend = string1 + mstop;
961 textend = partend;
962 }
963 else
964 {
965 partend = string1 + size1;
966 textend = string2 + mstop - size1;
967 }
968 part_2_end = string2 + mstop - size1;
969 }
970 else
971 {
972 text = string2 + pos - size1;
973 partend = string2 + mstop - size1;
974 textend = partend;
975 part_2_end = partend;
976 }
977
978 if (bufp->uses_registers && regs != NULL)
979 for (a = 0; a < RE_NREGS; a++)
980 regend_text[a] = NULL;
981
982 code = bufp->buffer;
983 translate = bufp->translate;
984 failure_stack_start = failure_sp = initial_failure_stack;
985 failure_stack_end = initial_failure_stack + INITIAL_FAILURES;
986
987#if 0
988 /* re_search_2 has already done this, and otherwise we get little benefit
989 from this. So I'll leave this out. */
990 if (bufp->fastmap_accurate && !bufp->can_be_null &&
991 text != textend &&
992 !bufp->fastmap[translate ?
993 (unsigned char)translate[(unsigned char)*text] :
994 (unsigned char)*text])
995 return -1; /* it can't possibly match */
996#endif
997
998 continue_matching:
999 for (;;)
1000 {
1001 switch (*code++)
1002 {
1003 case Cend:
1004 if (partend != part_2_end)
1005 match_end = text - string1;
1006 else
1007 match_end = text - string2 + size1;
1008 if (regs)
1009 {
1010 regs->start[0] = pos;
1011 regs->end[0] = match_end;
1012 if (!bufp->uses_registers)
1013 {
1014 for (a = 1; a < RE_NREGS; a++)
1015 {
1016 regs->start[a] = -1;
1017 regs->end[a] = -1;
1018 }
1019 }
1020 else
1021 {
1022 for (a = 1; a < RE_NREGS; a++)
1023 {
1024 if (regend_text[a] == NULL)
1025 {
1026 regs->start[a] = -1;
1027 regs->end[a] = -1;
1028 continue;
1029 }
1030 if (regstart_partend[a] != part_2_end)
1031 regs->start[a] = regstart_text[a] - string1;
1032 else
1033 regs->start[a] = regstart_text[a] - string2 + size1;
1034 if (regend_partend[a] != part_2_end)
1035 regs->end[a] = regend_text[a] - string1;
1036 else
1037 regs->end[a] = regend_text[a] - string2 + size1;
1038 }
1039 }
1040 }
1041 if (failure_stack_start != initial_failure_stack)
1042 free((char *)failure_stack_start);
1043 return match_end - pos;
1044 case Cbol:
1045 if (text == string1 || text[-1] == '\n') /* text[-1] always valid */
1046 break;
1047 goto fail;
1048 case Ceol:
1049 if (text == string2 + size2 ||
1050 (text == string1 + size1 ?
1051 (size2 == 0 || *string2 == '\n') :
1052 *text == '\n'))
1053 break;
1054 goto fail;
1055 case Cset:
1056 NEXTCHAR(ch);
1057 if (code[ch/8] & (1<<(ch & 7)))
1058 {
1059 code += 256/8;
1060 break;
1061 }
1062 goto fail;
1063 case Cexact:
1064 NEXTCHAR(ch);
1065 if (ch != (unsigned char)*code++)
1066 goto fail;
1067 break;
1068 case Canychar:
1069 NEXTCHAR(ch);
1070 if (ch == '\n')
1071 goto fail;
1072 break;
1073 case Cstart_memory:
1074 reg = *code++;
1075 regstart_text[reg] = text;
1076 regstart_partend[reg] = partend;
1077 break;
1078 case Cend_memory:
1079 reg = *code++;
1080 regend_text[reg] = text;
1081 regend_partend[reg] = partend;
1082 break;
1083 case Cmatch_memory:
1084 reg = *code++;
1085 if (regend_text[reg] == NULL)
1086 goto fail; /* or should we just match nothing? */
1087 regtext = regstart_text[reg];
1088 regtextend = regend_text[reg];
1089 if (regstart_partend[reg] == regend_partend[reg])
1090 regpartend = regtextend;
1091 else
1092 regpartend = string1 + size1;
1093
1094 for (;regtext != regtextend;)
1095 {
1096 NEXTCHAR(ch);
1097 if (regtext == regpartend)
1098 regtext = string2;
1099 regch = (unsigned char)*regtext++;
1100 if (translate)
1101 regch = (unsigned char)translate[regch];
1102 if (regch != ch)
1103 goto fail;
1104 }
1105 break;
1106 case Cstar_jump:
1107 /* star is coded as:
1108 1: failure_jump 2
1109 ... code for operand of star
1110 star_jump 1
1111 2: ... code after star
1112 We change the star_jump to update_failure_jump if we can determine
1113 that it is safe to do so; otherwise we change it to an ordinary
1114 jump.
1115 plus is coded as
1116 jump 2
1117 1: failure_jump 3
1118 2: ... code for operand of plus
1119 star_jump 1
1120 3: ... code after plus
1121 For star_jump considerations this is processed identically
1122 to star. */
1123 a = (unsigned char)*code++;
1124 a |= (unsigned char)*code++ << 8;
1125 a = (int)(short)a;
1126 {
1127 char map[256], can_be_null;
1128 char *p1, *p2;
1129
1130 p1 = code + a + 3; /* skip the failure_jump */
1131 assert(p1[-3] == Cfailure_jump);
1132 p2 = code;
1133 /* p1 points inside loop, p2 points to after loop */
1134 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
1135 p2 - bufp->buffer, &can_be_null, map))
1136 goto make_normal_jump;
1137 /* If we might introduce a new update point inside the loop,
1138 we can't optimize because then update_jump would update a
1139 wrong failure point. Thus we have to be quite careful here. */
1140 loop_p1:
1141 /* loop until we find something that consumes a character */
1142 switch (*p1++)
1143 {
1144 case Cbol:
1145 case Ceol:
1146 case Cbegbuf:
1147 case Cendbuf:
1148 case Cwordbeg:
1149 case Cwordend:
1150 case Cwordbound:
1151 case Cnotwordbound:
1152#ifdef emacs
1153 case Cemacs_at_dot:
1154#endif /* emacs */
1155 goto loop_p1;
1156 case Cstart_memory:
1157 case Cend_memory:
1158 p1++;
1159 goto loop_p1;
1160 case Cexact:
1161 ch = (unsigned char)*p1++;
1162 if (map[ch])
1163 goto make_normal_jump;
1164 break;
1165 case Canychar:
1166 for (b = 0; b < 256; b++)
1167 if (b != '\n' && map[b])
1168 goto make_normal_jump;
1169 break;
1170 case Cset:
1171 for (b = 0; b < 256; b++)
1172 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
1173 goto make_normal_jump;
1174 p1 += 256/8;
1175 break;
1176 default:
1177 goto make_normal_jump;
1178 }
1179 /* now we know that we can't backtrack. */
1180 while (p1 != p2 - 3)
1181 {
1182 switch (*p1++)
1183 {
1184 case Cend:
1185 abort(); /* we certainly shouldn't get this inside loop */
1186 /*NOTREACHED*/
1187 case Cbol:
1188 case Ceol:
1189 case Canychar:
1190 case Cbegbuf:
1191 case Cendbuf:
1192 case Cwordbeg:
1193 case Cwordend:
1194 case Cwordbound:
1195 case Cnotwordbound:
1196#ifdef emacs
1197 case Cemacs_at_dot:
1198#endif /* emacs */
1199 break;
1200 case Cset:
1201 p1 += 256/8;
1202 break;
1203 case Cexact:
1204 case Cstart_memory:
1205 case Cend_memory:
1206 case Cmatch_memory:
1207 case Csyntaxspec:
1208 case Cnotsyntaxspec:
1209 p1++;
1210 break;
1211 case Cjump:
1212 case Cstar_jump:
1213 case Cfailure_jump:
1214 case Cupdate_failure_jump:
1215 case Cdummy_failure_jump:
1216 goto make_normal_jump;
1217 default:
1218 printf("regexpr.c: processing star_jump: unknown op %d\n", p1[-1]);
1219 break;
1220 }
1221 }
1222 goto make_update_jump;
1223 }
1224 make_normal_jump:
1225 /* printf("changing to normal jump\n"); */
1226 code -= 3;
1227 *code = Cjump;
1228 break;
1229 make_update_jump:
1230 /* printf("changing to update jump\n"); */
1231 code -= 2;
1232 a += 3; /* jump to after the Cfailure_jump */
1233 code[-1] = Cupdate_failure_jump;
1234 code[0] = a & 0xff;
1235 code[1] = a >> 8;
1236 /* fall to next case */
1237 case Cupdate_failure_jump:
1238 failure_sp[-1].text = text;
1239 failure_sp[-1].partend = partend;
1240 /* fall to next case */
1241 case Cjump:
1242 a = (unsigned char)*code++;
1243 a |= (unsigned char)*code++ << 8;
1244 code += (int)(short)a;
1245 break;
1246 case Cdummy_failure_jump:
1247 case Cfailure_jump:
1248 if (failure_sp == failure_stack_end)
1249 {
1250 if (failure_stack_start != initial_failure_stack)
1251 goto error;
1252 failure_stack_start = (struct failure_point *)
1253 malloc(MAX_FAILURES * sizeof(*failure_stack_start));
Guido van Rossumc65a5251994-08-05 13:44:50 +00001254 if (failure_stack_start == NULL)
1255 {
1256 failure_stack_start = initial_failure_stack;
1257 goto error;
1258 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001259 failure_stack_end = failure_stack_start + MAX_FAILURES;
1260 memcpy((char *)failure_stack_start, (char *)initial_failure_stack,
1261 INITIAL_FAILURES * sizeof(*failure_stack_start));
1262 failure_sp = failure_stack_start + INITIAL_FAILURES;
1263 }
1264 a = (unsigned char)*code++;
1265 a |= (unsigned char)*code++ << 8;
1266 a = (int)(short)a;
1267 if (code[-3] == Cdummy_failure_jump)
1268 { /* this is only used in plus */
1269 assert(*code == Cfailure_jump);
1270 b = (unsigned char)code[1];
1271 b |= (unsigned char)code[2] << 8;
1272 failure_sp->code = code + (int)(short)b + 3;
1273 failure_sp->text = NULL;
1274 code += a;
1275 }
1276 else
1277 {
1278 failure_sp->code = code + a;
1279 failure_sp->text = text;
1280 failure_sp->partend = partend;
1281 }
1282 failure_sp++;
1283 break;
1284 case Cbegbuf:
1285 if (text == string1)
1286 break;
1287 goto fail;
1288 case Cendbuf:
1289 if (size2 == 0 ? text == string1 + size1 : text == string2 + size2)
1290 break;
1291 goto fail;
1292 case Cwordbeg:
1293 if (text == string2 + size2)
1294 goto fail;
1295 if (size2 == 0 && text == string1 + size1)
1296 goto fail;
1297 if (SYNTAX(text == string1 + size1 ? *string1 : *text) != Sword)
1298 goto fail;
1299 if (text == string1)
1300 break;
1301 if (SYNTAX(text[-1]) != Sword)
1302 break;
1303 goto fail;
1304 case Cwordend:
1305 if (text == string1)
1306 goto fail;
1307 if (SYNTAX(text[-1]) != Sword)
1308 goto fail;
1309 if (text == string2 + size2)
1310 break;
1311 if (size2 == 0 && text == string1 + size1)
1312 break;
1313 if (SYNTAX(*text) == Sword)
1314 goto fail;
1315 break;
1316 case Cwordbound:
1317 /* Note: as in gnu regexp, this also matches at the beginning
1318 and end of buffer. */
1319 if (text == string1 || text == string2 + size2 ||
1320 (size2 == 0 && text == string1 + size1))
1321 break;
1322 if ((SYNTAX(text[-1]) == Sword) ^
1323 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword))
1324 break;
1325 goto fail;
1326 case Cnotwordbound:
1327 /* Note: as in gnu regexp, this never matches at the beginning
1328 and end of buffer. */
1329 if (text == string1 || text == string2 + size2 ||
1330 (size2 == 0 && text == string1 + size1))
1331 goto fail;
1332 if (!((SYNTAX(text[-1]) == Sword) ^
1333 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)))
1334 goto fail;
1335 break;
1336 case Csyntaxspec:
1337 NEXTCHAR(ch);
1338 if (SYNTAX(ch) != (unsigned char)*code++)
1339 goto fail;
1340 break;
1341 case Cnotsyntaxspec:
1342 NEXTCHAR(ch);
1343 if (SYNTAX(ch) != (unsigned char)*code++)
1344 break;
1345 goto fail;
1346#ifdef emacs
1347 case Cemacs_at_dot:
1348 if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
1349 goto fail;
1350 break;
1351#endif /* emacs */
1352 default:
1353 abort();
1354 /*NOTREACHED*/
1355 }
1356 }
Guido van Rossum3b1a57a1992-01-27 16:47:46 +00001357#if 0 /* This line is never reached --Guido */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001358 abort();
Guido van Rossum5f21dd11992-01-19 16:49:14 +00001359#endif
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001360 /*NOTREACHED*/
1361
1362 fail:
1363 if (failure_sp != failure_stack_start)
1364 {
1365 failure_sp--;
1366 text = failure_sp->text;
1367 if (text == NULL)
1368 goto fail;
1369 partend = failure_sp->partend;
1370 code = failure_sp->code;
1371 goto continue_matching;
1372 }
1373 if (failure_stack_start != initial_failure_stack)
1374 free((char *)failure_stack_start);
1375 return -1;
1376
1377 error:
1378 if (failure_stack_start != initial_failure_stack)
1379 free((char *)failure_stack_start);
1380 return -2;
1381}
1382
1383#undef PREFETCH
1384#undef NEXTCHAR
1385#undef PUSH_FAILURE
1386
1387int re_match(bufp, string, size, pos, regs)
1388regexp_t bufp;
1389char *string;
1390int size, pos;
1391regexp_registers_t regs;
1392{
1393 return re_match_2(bufp, string, size, (char *)NULL, 0, pos, regs, size);
1394}
1395
1396int re_search_2(bufp, string1, size1, string2, size2, pos, range, regs,
1397 mstop)
1398regexp_t bufp;
1399char *string1, *string2;
1400int size1, size2, pos, range, mstop;
1401regexp_registers_t regs;
1402{
1403 char *fastmap, *translate, *text, *partstart, *partend;
1404 int dir, ret;
1405 char anchor;
1406
1407 assert(size1 >= 0 && size2 >= 0 && pos >= 0 && mstop >= 0);
Guido van Rossum521f81c1992-01-26 18:23:48 +00001408 assert(pos + range >= 0 && pos + range <= size1 + size2); /* Bugfix by ylo */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001409 assert(pos <= mstop);
1410
1411 fastmap = bufp->fastmap;
1412 translate = bufp->translate;
1413 if (fastmap && !bufp->fastmap_accurate)
1414 re_compile_fastmap(bufp);
1415 anchor = bufp->anchor;
1416 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1417 fastmap = NULL;
1418 if (range < 0)
1419 {
1420 dir = -1;
1421 range = -range;
1422 }
1423 else
1424 dir = 1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001425 if (anchor == 2)
1426 if (pos != 0)
1427 return -1;
1428 else
1429 range = 0;
1430 for (; range >= 0; range--, pos += dir)
1431 {
1432 if (fastmap)
1433 {
1434 if (dir == 1)
1435 { /* searching forwards */
1436 if (pos < size1)
1437 {
1438 text = string1 + pos;
1439 if (pos + range > size1)
1440 partend = string1 + size1;
1441 else
1442 partend = string1 + pos + range;
1443 }
1444 else
1445 {
1446 text = string2 + pos - size1;
1447 partend = string2 + pos + range - size1;
1448 }
1449 partstart = text;
1450 if (translate)
1451 while (text != partend &&
1452 !fastmap[(unsigned char)
1453 translate[(unsigned char)*text]])
1454 text++;
1455 else
1456 while (text != partend && !fastmap[(unsigned char)*text])
1457 text++;
1458 pos += text - partstart;
1459 range -= text - partstart;
1460 if (pos == size1 + size2 && bufp->can_be_null == 0)
1461 return -1;
1462 }
1463 else
1464 { /* searching backwards */
1465 if (pos <= size1)
1466 {
1467 text = string1 + pos;
1468 partstart = string1 + pos - range;
1469 }
1470 else
1471 {
1472 text = string2 + pos - size1;
1473 if (range < pos - size1)
1474 partstart = string2 + pos - size1 - range;
1475 else
1476 partstart = string2;
1477 }
1478 partend = text;
1479 if (translate)
1480 while (text != partstart &&
1481 !fastmap[(unsigned char)
1482 translate[(unsigned char)*text]])
1483 text--;
1484 else
1485 while (text != partstart &&
1486 !fastmap[(unsigned char)*text])
1487 text--;
1488 pos -= partend - text;
1489 range -= partend - text;
1490 }
1491 }
1492 if (anchor == 1)
1493 { /* anchored to begline */
1494 if (pos > 0 &&
1495 (pos <= size1 ? string1[pos - 1] :
1496 string2[pos - size1 - 1]) != '\n')
1497 continue;
1498 }
1499 assert(pos >= 0 && pos <= size1 + size2);
1500 ret = re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop);
1501 if (ret >= 0)
1502 return pos;
1503 if (ret == -2)
1504 return -2;
1505 }
1506 return -1;
1507}
1508
1509int re_search(bufp, string, size, startpos, range, regs)
1510regexp_t bufp;
1511char *string;
1512int size, startpos, range;
1513regexp_registers_t regs;
1514{
1515 return re_search_2(bufp, string, size, (char *)NULL, 0,
1516 startpos, range, regs, size);
1517}
1518
Guido van Rossum9abc5391992-03-27 17:24:37 +00001519#ifdef UNUSED
1520
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001521static struct re_pattern_buffer re_comp_buf;
1522
1523char *re_comp(s)
1524char *s;
1525{
1526 if (s == NULL)
1527 {
1528 if (!re_comp_buf.buffer)
1529 return "Out of memory";
1530 return NULL;
1531 }
1532 if (!re_comp_buf.buffer)
1533 {
1534 /* the buffer will be allocated automatically */
1535 re_comp_buf.fastmap = malloc(256);
1536 re_comp_buf.translate = NULL;
Guido van Rossumc65a5251994-08-05 13:44:50 +00001537 if (re_comp_buf.fastmap == NULL)
1538 return "Out of memory";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001539 }
1540 return re_compile_pattern(s, strlen(s), &re_comp_buf);
1541}
1542
1543int re_exec(s)
1544char *s;
1545{
1546 int len = strlen(s);
1547
1548 return re_search(&re_comp_buf, s, len, 0, len, (regexp_registers_t)NULL) >= 0;
1549}
1550
Guido van Rossum9abc5391992-03-27 17:24:37 +00001551#endif
1552
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001553#ifdef TEST_REGEXP
1554
1555int main()
1556{
1557 char buf[500];
1558 char *cp;
1559 struct re_pattern_buffer exp;
1560 struct re_registers regs;
1561 int a,pos;
1562 char fastmap[256];
1563
1564 exp.allocated = 0;
1565 exp.buffer = 0;
1566 exp.translate = NULL;
1567 exp.fastmap = fastmap;
1568
1569 /* re_set_syntax(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_ANSI_HEX); */
1570
1571 while (1)
1572 {
1573 printf("Enter regexp:\n");
1574 gets(buf);
1575 cp=re_compile_pattern(buf, strlen(buf), &exp);
1576 if (cp)
1577 {
1578 printf("Error: %s\n", cp);
1579 continue;
1580 }
1581 re_compile_fastmap(&exp);
1582 printf("dump:\n");
1583 for (pos = 0; pos < exp.used;)
1584 {
1585 printf("%d: ", pos);
1586 switch (exp.buffer[pos++])
1587 {
1588 case Cend:
1589 strcpy(buf, "end");
1590 break;
1591 case Cbol:
1592 strcpy(buf, "bol");
1593 break;
1594 case Ceol:
1595 strcpy(buf, "eol");
1596 break;
1597 case Cset:
1598 strcpy(buf, "set ");
1599 for (a = 0; a < 256/8; a++)
1600 sprintf(buf+strlen(buf)," %02x",
1601 (unsigned char)exp.buffer[pos++]);
1602 break;
1603 case Cexact:
1604 sprintf(buf, "exact '%c' 0x%x", exp.buffer[pos],
1605 (unsigned char)exp.buffer[pos]);
1606 pos++;
1607 break;
1608 case Canychar:
1609 strcpy(buf, "anychar");
1610 break;
1611 case Cstart_memory:
1612 sprintf(buf, "start_memory %d", exp.buffer[pos++]);
1613 break;
1614 case Cend_memory:
1615 sprintf(buf, "end_memory %d", exp.buffer[pos++]);
1616 break;
1617 case Cmatch_memory:
1618 sprintf(buf, "match_memory %d", exp.buffer[pos++]);
1619 break;
1620 case Cjump:
1621 case Cdummy_failure_jump:
1622 case Cstar_jump:
1623 case Cfailure_jump:
1624 case Cupdate_failure_jump:
1625 a = (unsigned char)exp.buffer[pos++];
1626 a += (unsigned char)exp.buffer[pos++] << 8;
1627 a = (int)(short)a;
1628 switch (exp.buffer[pos-3])
1629 {
1630 case Cjump:
1631 cp = "jump";
1632 break;
1633 case Cstar_jump:
1634 cp = "star_jump";
1635 break;
1636 case Cfailure_jump:
1637 cp = "failure_jump";
1638 break;
1639 case Cupdate_failure_jump:
1640 cp = "update_failure_jump";
1641 break;
1642 case Cdummy_failure_jump:
1643 cp = "dummy_failure_jump";
1644 break;
1645 default:
1646 cp = "unknown jump";
1647 break;
1648 }
1649 sprintf(buf, "%s %d", cp, a + pos);
1650 break;
1651 case Cbegbuf:
1652 strcpy(buf,"begbuf");
1653 break;
1654 case Cendbuf:
1655 strcpy(buf,"endbuf");
1656 break;
1657 case Cwordbeg:
1658 strcpy(buf,"wordbeg");
1659 break;
1660 case Cwordend:
1661 strcpy(buf,"wordend");
1662 break;
1663 case Cwordbound:
1664 strcpy(buf,"wordbound");
1665 break;
1666 case Cnotwordbound:
1667 strcpy(buf,"notwordbound");
1668 break;
1669 default:
1670 sprintf(buf, "unknown code %d",
1671 (unsigned char)exp.buffer[pos - 1]);
1672 break;
1673 }
1674 printf("%s\n", buf);
1675 }
1676 printf("can_be_null = %d uses_registers = %d anchor = %d\n",
1677 exp.can_be_null, exp.uses_registers, exp.anchor);
1678
1679 printf("fastmap:");
1680 for (a = 0; a < 256; a++)
1681 if (exp.fastmap[a])
1682 printf(" %d", a);
1683 printf("\n");
1684 printf("Enter strings. An empty line terminates.\n");
1685 while (fgets(buf, sizeof(buf), stdin))
1686 {
1687 if (buf[0] == '\n')
1688 break;
1689 a = re_search(&exp, buf, strlen(buf), 0, strlen(buf), &regs);
1690 printf("search returns %d\n", a);
1691 if (a != -1)
1692 {
1693 for (a = 0; a < RE_NREGS; a++)
1694 {
1695 printf("buf %d: %d to %d\n", a, regs.start[a], regs.end[a]);
1696 }
1697 }
1698 }
1699 }
1700}
1701
1702#endif /* TEST_REGEXP */