blob: ccf515a12ae418838635d6831f2f7b69a8934fd2 [file] [log] [blame]
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001/*
2
3regexpr.c
4
5Author: Tatu Ylonen <ylo@ngs.fi>
6
7Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
8
9Permission to use, copy, modify, distribute, and sell this software
10and its documentation for any purpose is hereby granted without fee,
11provided that the above copyright notice appear in all copies. This
12software is provided "as is" without express or implied warranty.
13
14Created: Thu Sep 26 17:14:05 1991 ylo
15Last modified: Mon Nov 4 17:06:48 1991 ylo
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000016Ported to Think C: 19 Jan 1992 guido@cwi.nl
Guido van Rossumb674c3b1992-01-19 16:32:47 +000017
18This code draws many ideas from the regular expression packages by
19Henry Spencer of the University of Toronto and Richard Stallman of the
20Free Software Foundation.
21
22Emacs-specific code and syntax table code is almost directly borrowed
23from GNU regexp.
24
25$Header$
26
27*/
28
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000029#include "PROTO.h" /* For PROTO macro --Guido */
30
Guido van Rossumb674c3b1992-01-19 16:32:47 +000031#include <stdio.h>
32#include <assert.h>
33#include "regexpr.h"
34
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000035#ifdef THINK_C
36/* Think C on the Mac really needs these headers... --Guido */
37#include <stdlib.h>
38#include <string.h>
39#else
Guido van Rossumb674c3b1992-01-19 16:32:47 +000040char *malloc();
41void free();
42char *realloc();
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000043#endif
Guido van Rossumb674c3b1992-01-19 16:32:47 +000044
45#define MACRO_BEGIN do {
46#define MACRO_END } while (0)
47
48enum regexp_compiled_ops /* opcodes for compiled regexp */
49{
50 Cend, /* end of pattern reached */
51 Cbol, /* beginning of line */
52 Ceol, /* end of line */
53 Cset, /* character set. Followed by 32 bytes of set. */
54 Cexact, /* followed by a byte to match */
55 Canychar, /* matches any character except newline */
56 Cstart_memory, /* set register start addr (followed by reg number) */
57 Cend_memory, /* set register end addr (followed by reg number) */
58 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
59 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
60 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
61 Cfailure_jump, /* jump to addr on failure */
62 Cupdate_failure_jump, /* update topmost failure point and jump */
63 Cdummy_failure_jump, /* push a dummy failure point and jump */
64 Cbegbuf, /* match at beginning of buffer */
65 Cendbuf, /* match at end of buffer */
66 Cwordbeg, /* match at beginning of word */
67 Cwordend, /* match at end of word */
68 Cwordbound, /* match if at word boundary */
69 Cnotwordbound, /* match if not at word boundary */
70#ifdef emacs
71 Cemacs_at_dot, /* emacs only: matches at dot */
72#endif /* emacs */
73 Csyntaxspec, /* matches syntax code (1 byte follows) */
74 Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/
75};
76
77enum regexp_syntax_op /* syntax codes for plain and quoted characters */
78{
79 Rend, /* special code for end of regexp */
80 Rnormal, /* normal character */
81 Ranychar, /* any character except newline */
82 Rquote, /* the quote character */
83 Rbol, /* match beginning of line */
84 Reol, /* match end of line */
85 Roptional, /* match preceding expression optionally */
86 Rstar, /* match preceding expr zero or more times */
87 Rplus, /* match preceding expr one or more times */
88 Ror, /* match either of alternatives */
89 Ropenpar, /* opening parenthesis */
90 Rclosepar, /* closing parenthesis */
91 Rmemory, /* match memory register */
92 Rextended_memory, /* \vnn to match registers 10-99 */
93 Ropenset, /* open set. Internal syntax hard-coded below. */
94 /* the following are gnu extensions to "normal" regexp syntax */
95 Rbegbuf, /* beginning of buffer */
96 Rendbuf, /* end of buffer */
97 Rwordchar, /* word character */
98 Rnotwordchar, /* not word character */
99 Rwordbeg, /* beginning of word */
100 Rwordend, /* end of word */
101 Rwordbound, /* word bound */
102 Rnotwordbound, /* not word bound */
103#ifdef emacs
104 Remacs_at_dot, /* emacs: at dot */
105 Remacs_syntaxspec, /* syntaxspec */
106 Remacs_notsyntaxspec, /* notsyntaxspec */
107#endif /* emacs */
108 Rnum_ops
109};
110
111static int re_compile_initialized = 0;
112static int regexp_syntax = 0;
113static unsigned char regexp_plain_ops[256];
114static unsigned char regexp_quoted_ops[256];
115static unsigned char regexp_precedences[Rnum_ops];
116static int regexp_context_indep_ops;
117static int regexp_ansi_sequences;
118
119#define NUM_LEVELS 5 /* number of precedence levels in use */
120#define MAX_NESTING 100 /* max nesting level of operators */
121
122#ifdef emacs
123
124/* This code is for emacs compatibility only. */
125
126#include "config.h"
127#include "lisp.h"
128#include "buffer.h"
129#include "syntax.h"
130
131/* emacs defines NULL in some strange way? */
132#undef NULL
133#define NULL 0
134
135#else /* emacs */
136
137#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
138#define Sword 1
139
140#ifdef SYNTAX_TABLE
141char *re_syntax_table;
142#else
143static char re_syntax_table[256];
144#endif /* SYNTAX_TABLE */
145
146#endif /* emacs */
147
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000148static void re_compile_initialize PROTO((void));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000149static void re_compile_initialize()
150{
151 int a;
152
153#if !defined(emacs) && !defined(SYNTAX_TABLE)
154 static int syntax_table_inited = 0;
155
156 if (!syntax_table_inited)
157 {
158 syntax_table_inited = 1;
159 memset(re_syntax_table, 0, 256);
160 for (a = 'a'; a <= 'z'; a++)
161 re_syntax_table[a] = Sword;
162 for (a = 'A'; a <= 'Z'; a++)
163 re_syntax_table[a] = Sword;
164 for (a = '0'; a <= '9'; a++)
165 re_syntax_table[a] = Sword;
166 }
167#endif /* !emacs && !SYNTAX_TABLE */
168 re_compile_initialized = 1;
169 for (a = 0; a < 256; a++)
170 {
171 regexp_plain_ops[a] = Rnormal;
172 regexp_quoted_ops[a] = Rnormal;
173 }
174 for (a = '0'; a <= '9'; a++)
175 regexp_quoted_ops[a] = Rmemory;
176 regexp_plain_ops['\134'] = Rquote;
177 if (regexp_syntax & RE_NO_BK_PARENS)
178 {
179 regexp_plain_ops['('] = Ropenpar;
180 regexp_plain_ops[')'] = Rclosepar;
181 }
182 else
183 {
184 regexp_quoted_ops['('] = Ropenpar;
185 regexp_quoted_ops[')'] = Rclosepar;
186 }
187 if (regexp_syntax & RE_NO_BK_VBAR)
188 regexp_plain_ops['\174'] = Ror;
189 else
190 regexp_quoted_ops['\174'] = Ror;
191 regexp_plain_ops['*'] = Rstar;
192 if (regexp_syntax & RE_BK_PLUS_QM)
193 {
194 regexp_quoted_ops['+'] = Rplus;
195 regexp_quoted_ops['?'] = Roptional;
196 }
197 else
198 {
199 regexp_plain_ops['+'] = Rplus;
200 regexp_plain_ops['?'] = Roptional;
201 }
202 if (regexp_syntax & RE_NEWLINE_OR)
203 regexp_plain_ops['\n'] = Ror;
204 regexp_plain_ops['\133'] = Ropenset;
205 regexp_plain_ops['\136'] = Rbol;
206 regexp_plain_ops['$'] = Reol;
207 regexp_plain_ops['.'] = Ranychar;
208 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
209 {
210#ifdef emacs
211 regexp_quoted_ops['='] = Remacs_at_dot;
212 regexp_quoted_ops['s'] = Remacs_syntaxspec;
213 regexp_quoted_ops['S'] = Remacs_notsyntaxspec;
214#endif /* emacs */
215 regexp_quoted_ops['w'] = Rwordchar;
216 regexp_quoted_ops['W'] = Rnotwordchar;
217 regexp_quoted_ops['<'] = Rwordbeg;
218 regexp_quoted_ops['>'] = Rwordend;
219 regexp_quoted_ops['b'] = Rwordbound;
220 regexp_quoted_ops['B'] = Rnotwordbound;
221 regexp_quoted_ops['`'] = Rbegbuf;
222 regexp_quoted_ops['\''] = Rendbuf;
223 }
224 if (regexp_syntax & RE_ANSI_HEX)
225 regexp_quoted_ops['v'] = Rextended_memory;
226 for (a = 0; a < Rnum_ops; a++)
227 regexp_precedences[a] = 4;
228 if (regexp_syntax & RE_TIGHT_VBAR)
229 {
230 regexp_precedences[Ror] = 3;
231 regexp_precedences[Rbol] = 2;
232 regexp_precedences[Reol] = 2;
233 }
234 else
235 {
236 regexp_precedences[Ror] = 2;
237 regexp_precedences[Rbol] = 3;
238 regexp_precedences[Reol] = 3;
239 }
240 regexp_precedences[Rclosepar] = 1;
241 regexp_precedences[Rend] = 0;
242 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
243 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
244}
245
246int re_set_syntax(syntax)
247int syntax;
248{
249 int ret;
250
251 ret = regexp_syntax;
252 regexp_syntax = syntax;
253 re_compile_initialize();
254 return ret;
255}
256
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000257static int hex_char_to_decimal PROTO((int));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000258static int hex_char_to_decimal(ch)
259int ch;
260{
261 if (ch >= '0' && ch <= '9')
262 return ch - '0';
263 if (ch >= 'a' && ch <= 'f')
264 return ch - 'a' + 10;
265 if (ch >= 'A' && ch <= 'F')
266 return ch - 'A' + 10;
267 return 16;
268}
269
270char *re_compile_pattern(regex, size, bufp)
271char *regex;
272int size;
273regexp_t bufp;
274{
275 int a, pos, op, current_level, level, opcode;
276 int pattern_offset, alloc;
277 int starts[NUM_LEVELS * MAX_NESTING], starts_base;
278 int future_jumps[MAX_NESTING], num_jumps;
279 unsigned char ch;
280 char *pattern, *translate;
281 int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS];
282 int beginning_context;
283
284#define NEXTCHAR(var) \
285 MACRO_BEGIN \
286 if (pos >= size) \
287 goto ends_prematurely; \
288 (var) = regex[pos]; \
289 pos++; \
290 MACRO_END
291
292#define ALLOC(amount) \
293 MACRO_BEGIN \
294 if (pattern_offset+(amount) > alloc) \
295 { \
296 alloc += 256 + (amount); \
297 pattern = realloc(pattern, alloc); \
298 if (!pattern) \
299 goto out_of_memory; \
300 } \
301 MACRO_END
302
303#define STORE(ch) pattern[pattern_offset++] = (ch)
304
305#define CURRENT_LEVEL_START (starts[starts_base + current_level])
306
307#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
308
309#define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
310 starts_base += NUM_LEVELS; \
311 else \
312 goto too_complex
313
314#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
315
316#define PUT_ADDR(offset,addr) \
317 MACRO_BEGIN \
318 int disp = (addr) - (offset) - 2; \
319 pattern[(offset)] = disp & 0xff; \
320 pattern[(offset)+1] = (disp>>8) & 0xff; \
321 MACRO_END
322
323#define INSERT_JUMP(pos,type,addr) \
324 MACRO_BEGIN \
325 int a, p = (pos), t = (type), ad = (addr); \
326 for (a = pattern_offset - 1; a >= p; a--) \
327 pattern[a + 3] = pattern[a]; \
328 pattern[p] = t; \
329 PUT_ADDR(p+1,ad); \
330 pattern_offset += 3; \
331 MACRO_END
332
333#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
334
335#define SET_FIELDS \
336 MACRO_BEGIN \
337 bufp->allocated = alloc; \
338 bufp->buffer = pattern; \
339 bufp->used = pattern_offset; \
340 MACRO_END
341
342#define GETHEX(var) \
343 MACRO_BEGIN \
344 char gethex_ch, gethex_value; \
345 NEXTCHAR(gethex_ch); \
346 gethex_value = hex_char_to_decimal(gethex_ch); \
347 if (gethex_value == 16) \
348 goto hex_error; \
349 NEXTCHAR(gethex_ch); \
350 gethex_ch = hex_char_to_decimal(gethex_ch); \
351 if (gethex_ch == 16) \
352 goto hex_error; \
353 (var) = gethex_value * 16 + gethex_ch; \
354 MACRO_END
355
356#define ANSI_TRANSLATE(ch) \
357 MACRO_BEGIN \
358 switch (ch) \
359 { \
360 case 'a': \
361 case 'A': \
362 ch = 7; /* audible bell */ \
363 break; \
364 case 'b': \
365 case 'B': \
366 ch = 8; /* backspace */ \
367 break; \
368 case 'f': \
369 case 'F': \
370 ch = 12; /* form feed */ \
371 break; \
372 case 'n': \
373 case 'N': \
374 ch = 10; /* line feed */ \
375 break; \
376 case 'r': \
377 case 'R': \
378 ch = 13; /* carriage return */ \
379 break; \
380 case 't': \
381 case 'T': \
382 ch = 9; /* tab */ \
383 break; \
384 case 'v': \
385 case 'V': \
386 ch = 11; /* vertical tab */ \
387 break; \
388 case 'x': /* hex code */ \
389 case 'X': \
390 GETHEX(ch); \
391 break; \
392 default: \
393 /* other characters passed through */ \
394 if (translate) \
395 ch = translate[(unsigned char)ch]; \
396 break; \
397 } \
398 MACRO_END
399
400 if (!re_compile_initialized)
401 re_compile_initialize();
402 bufp->used = 0;
403 bufp->fastmap_accurate = 0;
404 bufp->uses_registers = 0;
405 translate = bufp->translate;
406 pattern = bufp->buffer;
407 alloc = bufp->allocated;
408 if (alloc == 0 || pattern == NULL)
409 {
410 alloc = 256;
411 pattern = malloc(alloc);
412 if (!pattern)
413 goto out_of_memory;
414 }
415 pattern_offset = 0;
416 starts_base = 0;
417 num_jumps = 0;
418 current_level = 0;
419 SET_LEVEL_START;
420 num_open_registers = 0;
421 next_register = 1;
422 paren_depth = 0;
423 beginning_context = 1;
424 op = -1;
425 /* we use Rend dummy to ensure that pending jumps are updated (due to
426 low priority of Rend) before exiting the loop. */
427 pos = 0;
428 while (op != Rend)
429 {
430 if (pos >= size)
431 op = Rend;
432 else
433 {
434 NEXTCHAR(ch);
435 if (translate)
436 ch = translate[(unsigned char)ch];
437 op = regexp_plain_ops[(unsigned char)ch];
438 if (op == Rquote)
439 {
440 NEXTCHAR(ch);
441 op = regexp_quoted_ops[(unsigned char)ch];
442 if (op == Rnormal && regexp_ansi_sequences)
443 ANSI_TRANSLATE(ch);
444 }
445 }
446 level = regexp_precedences[op];
447 /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
448 ch, op, level, current_level, CURRENT_LEVEL_START); */
449 if (level > current_level)
450 {
451 for (current_level++; current_level < level; current_level++)
452 SET_LEVEL_START;
453 SET_LEVEL_START;
454 }
455 else
456 if (level < current_level)
457 {
458 current_level = level;
459 for (;num_jumps > 0 &&
460 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
461 num_jumps--)
462 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
463 }
464 switch (op)
465 {
466 case Rend:
467 break;
468 case Rnormal:
469 normal_char:
470 opcode = Cexact;
471 store_opcode_and_arg: /* opcode & ch must be set */
472 SET_LEVEL_START;
473 ALLOC(2);
474 STORE(opcode);
475 STORE(ch);
476 break;
477 case Ranychar:
478 opcode = Canychar;
479 store_opcode:
480 SET_LEVEL_START;
481 ALLOC(1);
482 STORE(opcode);
483 break;
484 case Rquote:
485 abort();
486 /*NOTREACHED*/
487 case Rbol:
488 if (!beginning_context)
489 if (regexp_context_indep_ops)
490 goto op_error;
491 else
492 goto normal_char;
493 opcode = Cbol;
494 goto store_opcode;
495 case Reol:
496 if (!((pos >= size) ||
497 ((regexp_syntax & RE_NO_BK_VBAR) ?
498 (regex[pos] == '\174') :
499 (pos+1 < size && regex[pos] == '\134' &&
500 regex[pos+1] == '\174')) ||
501 ((regexp_syntax & RE_NO_BK_PARENS)?
502 (regex[pos] == ')'):
503 (pos+1 < size && regex[pos] == '\134' &&
504 regex[pos+1] == ')'))))
505 if (regexp_context_indep_ops)
506 goto op_error;
507 else
508 goto normal_char;
509 opcode = Ceol;
510 goto store_opcode;
511 break;
512 case Roptional:
513 if (beginning_context)
514 if (regexp_context_indep_ops)
515 goto op_error;
516 else
517 goto normal_char;
518 if (CURRENT_LEVEL_START == pattern_offset)
519 break; /* ignore empty patterns for ? */
520 ALLOC(3);
521 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
522 pattern_offset + 3);
523 break;
524 case Rstar:
525 case Rplus:
526 if (beginning_context)
527 if (regexp_context_indep_ops)
528 goto op_error;
529 else
530 goto normal_char;
531 if (CURRENT_LEVEL_START == pattern_offset)
532 break; /* ignore empty patterns for + and * */
533 ALLOC(9);
534 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
535 pattern_offset + 6);
536 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
537 if (op == Rplus) /* jump over initial failure_jump */
538 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
539 CURRENT_LEVEL_START + 6);
540 break;
541 case Ror:
542 ALLOC(6);
543 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
544 pattern_offset + 6);
545 if (num_jumps >= MAX_NESTING)
546 goto too_complex;
547 STORE(Cjump);
548 future_jumps[num_jumps++] = pattern_offset;
549 STORE(0);
550 STORE(0);
551 SET_LEVEL_START;
552 break;
553 case Ropenpar:
554 SET_LEVEL_START;
555 if (next_register < RE_NREGS)
556 {
557 bufp->uses_registers = 1;
558 ALLOC(2);
559 STORE(Cstart_memory);
560 STORE(next_register);
561 open_registers[num_open_registers++] = next_register;
562 next_register++;
563 }
564 paren_depth++;
565 PUSH_LEVEL_STARTS;
566 current_level = 0;
567 SET_LEVEL_START;
568 break;
569 case Rclosepar:
570 if (paren_depth <= 0)
571 goto parenthesis_error;
572 POP_LEVEL_STARTS;
573 current_level = regexp_precedences[Ropenpar];
574 paren_depth--;
575 if (paren_depth < num_open_registers)
576 {
577 bufp->uses_registers = 1;
578 ALLOC(2);
579 STORE(Cend_memory);
580 num_open_registers--;
581 STORE(open_registers[num_open_registers]);
582 }
583 break;
584 case Rmemory:
585 if (ch == '0')
586 goto bad_match_register;
587 assert(ch >= '0' && ch <= '9');
588 bufp->uses_registers = 1;
589 opcode = Cmatch_memory;
590 ch -= '0';
591 goto store_opcode_and_arg;
592 case Rextended_memory:
593 NEXTCHAR(ch);
594 if (ch < '0' || ch > '9')
595 goto bad_match_register;
596 NEXTCHAR(a);
597 if (a < '0' || a > '9')
598 goto bad_match_register;
599 ch = 10 * (a - '0') + ch - '0';
600 if (ch <= 0 || ch >= RE_NREGS)
601 goto bad_match_register;
602 bufp->uses_registers = 1;
603 opcode = Cmatch_memory;
604 goto store_opcode_and_arg;
605 case Ropenset:
606 {
607 int complement,prev,offset,range,firstchar;
608
609 SET_LEVEL_START;
610 ALLOC(1+256/8);
611 STORE(Cset);
612 offset = pattern_offset;
613 for (a = 0; a < 256/8; a++)
614 STORE(0);
615 NEXTCHAR(ch);
616 if (translate)
617 ch = translate[(unsigned char)ch];
618 if (ch == '\136')
619 {
620 complement = 1;
621 NEXTCHAR(ch);
622 if (translate)
623 ch = translate[(unsigned char)ch];
624 }
625 else
626 complement = 0;
627 prev = -1;
628 range = 0;
629 firstchar = 1;
630 while (ch != '\135' || firstchar)
631 {
632 firstchar = 0;
633 if (regexp_ansi_sequences && ch == '\134')
634 {
635 NEXTCHAR(ch);
636 ANSI_TRANSLATE(ch);
637 }
638 if (range)
639 {
640 for (a = prev; a <= ch; a++)
641 SETBIT(pattern, offset, a);
642 prev = -1;
643 range = 0;
644 }
645 else
646 if (prev != -1 && ch == '-')
647 range = 1;
648 else
649 {
650 SETBIT(pattern, offset, ch);
651 prev = ch;
652 }
653 NEXTCHAR(ch);
654 if (translate)
655 ch = translate[(unsigned char)ch];
656 }
657 if (range)
658 SETBIT(pattern, offset, '-');
659 if (complement)
660 {
661 for (a = 0; a < 256/8; a++)
662 pattern[offset+a] ^= 0xff;
663 }
664 break;
665 }
666 case Rbegbuf:
667 opcode = Cbegbuf;
668 goto store_opcode;
669 case Rendbuf:
670 opcode = Cendbuf;
671 goto store_opcode;
672 case Rwordchar:
673 opcode = Csyntaxspec;
674 ch = Sword;
675 goto store_opcode_and_arg;
676 case Rnotwordchar:
677 opcode = Cnotsyntaxspec;
678 ch = Sword;
679 goto store_opcode_and_arg;
680 case Rwordbeg:
681 opcode = Cwordbeg;
682 goto store_opcode;
683 case Rwordend:
684 opcode = Cwordend;
685 goto store_opcode;
686 case Rwordbound:
687 opcode = Cwordbound;
688 goto store_opcode;
689 case Rnotwordbound:
690 opcode = Cnotwordbound;
691 goto store_opcode;
692#ifdef emacs
693 case Remacs_at_dot:
694 opcode = Cemacs_at_dot;
695 goto store_opcode;
696 case Remacs_syntaxspec:
697 NEXTCHAR(ch);
698 if (translate)
699 ch = translate[(unsigned char)ch];
700 opcode = Csyntaxspec;
701 ch = syntax_spec_code[(unsigned char)ch];
702 goto store_opcode_and_arg;
703 case Remacs_notsyntaxspec:
704 NEXTCHAR(ch);
705 if (translate)
706 ch = translate[(unsigned char)ch];
707 opcode = Cnotsyntaxspec;
708 ch = syntax_spec_code[(unsigned char)ch];
709 goto store_opcode_and_arg;
710#endif /* emacs */
711 default:
712 abort();
713 }
714 beginning_context = (op == Ropenpar || op == Ror);
715 }
716 if (starts_base != 0)
717 goto parenthesis_error;
718 assert(num_jumps == 0);
719 ALLOC(1);
720 STORE(Cend);
721 SET_FIELDS;
722 return NULL;
723
724 op_error:
725 SET_FIELDS;
726 return "Badly placed special character";
727
728 bad_match_register:
729 SET_FIELDS;
730 return "Bad match register number";
731
732 hex_error:
733 SET_FIELDS;
734 return "Bad hexadecimal number";
735
736 parenthesis_error:
737 SET_FIELDS;
738 return "Badly placed parenthesis";
739
740 out_of_memory:
741 SET_FIELDS;
742 return "Out of memory";
743
744 ends_prematurely:
745 SET_FIELDS;
746 return "Regular expression ends prematurely";
747
748 too_complex:
749 SET_FIELDS;
750 return "Regular expression too complex";
751}
752#undef CHARAT
753#undef NEXTCHAR
754#undef GETHEX
755#undef ALLOC
756#undef STORE
757#undef CURRENT_LEVEL_START
758#undef SET_LEVEL_START
759#undef PUSH_LEVEL_STARTS
760#undef POP_LEVEL_STARTS
761#undef PUT_ADDR
762#undef INSERT_JUMP
763#undef SETBIT
764#undef SET_FIELDS
765
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000766static void re_compile_fastmap_aux
767 PROTO((char *, int, char *, char *, char *));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000768static void re_compile_fastmap_aux(code, pos, visited, can_be_null, fastmap)
769char *code, *visited, *can_be_null, *fastmap;
770int pos;
771{
772 int a, b, syntaxcode;
773
774 if (visited[pos])
775 return; /* we have already been here */
776 visited[pos] = 1;
777 for (;;)
778 switch (code[pos++])
779 {
780 case Cend:
781 *can_be_null = 1;
782 return;
783 case Cbol:
784 case Cbegbuf:
785 case Cendbuf:
786 case Cwordbeg:
787 case Cwordend:
788 case Cwordbound:
789 case Cnotwordbound:
790#ifdef emacs
791 case Cemacs_at_dot:
792#endif /* emacs */
793 break;
794 case Csyntaxspec:
795 syntaxcode = code[pos++];
796 for (a = 0; a < 256; a++)
797 if (SYNTAX(a) == syntaxcode)
798 fastmap[a] = 1;
799 return;
800 case Cnotsyntaxspec:
801 syntaxcode = code[pos++];
802 for (a = 0; a < 256; a++)
803 if (SYNTAX(a) != syntaxcode)
804 fastmap[a] = 1;
805 return;
806 case Ceol:
807 fastmap['\n'] = 1;
808 if (*can_be_null == 0)
809 *can_be_null = 2; /* can match null, but only at end of buffer*/
810 return;
811 case Cset:
812 for (a = 0; a < 256/8; a++)
813 if (code[pos + a] != 0)
814 for (b = 0; b < 8; b++)
815 if (code[pos + a] & (1 << b))
816 fastmap[(a << 3) + b] = 1;
817 pos += 256/8;
818 return;
819 case Cexact:
820 fastmap[(unsigned char)code[pos]] = 1;
821 return;
822 case Canychar:
823 for (a = 0; a < 256; a++)
824 if (a != '\n')
825 fastmap[a] = 1;
826 return;
827 case Cstart_memory:
828 case Cend_memory:
829 pos++;
830 break;
831 case Cmatch_memory:
832 /* should this ever happen for sensible patterns??? */
833 *can_be_null = 1;
834 return;
835 case Cjump:
836 case Cdummy_failure_jump:
837 case Cupdate_failure_jump:
838 case Cstar_jump:
839 a = (unsigned char)code[pos++];
840 a |= (unsigned char)code[pos++] << 8;
841 pos += (int)(short)a;
842 if (visited[pos])
843 {
844 /* argh... the regexp contains empty loops. This is not
845 good, as this may cause a failure stack overflow when
846 matching. Oh well. */
847 /* this path leads nowhere; pursue other paths. */
848 return;
849 }
850 visited[pos] = 1;
851 break;
852 case Cfailure_jump:
853 a = (unsigned char)code[pos++];
854 a |= (unsigned char)code[pos++] << 8;
855 a = pos + (int)(short)a;
856 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
857 break;
858 default:
859 abort(); /* probably some opcode is missing from this switch */
860 /*NOTREACHED*/
861 }
862}
863
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000864static int re_do_compile_fastmap PROTO((char *, int, int, char *, char *));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000865static int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
866char *buffer, *fastmap, *can_be_null;
867int used, pos;
868{
869 char small_visited[512], *visited;
870
871 if (used <= sizeof(small_visited))
872 visited = small_visited;
873 else
874 {
875 visited = malloc(used);
876 if (!visited)
877 return 0;
878 }
879 *can_be_null = 0;
880 memset(fastmap, 0, 256);
881 memset(visited, 0, used);
882 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
883 if (visited != small_visited)
884 free(visited);
885 return 1;
886}
887
888void re_compile_fastmap(bufp)
889regexp_t bufp;
890{
891 if (!bufp->fastmap || bufp->fastmap_accurate)
892 return;
893 assert(bufp->used > 0);
894 if (!re_do_compile_fastmap(bufp->buffer, bufp->used, 0, &bufp->can_be_null,
895 bufp->fastmap))
896 return;
897 if (bufp->buffer[0] == Cbol)
898 bufp->anchor = 1; /* begline */
899 else
900 if (bufp->buffer[0] == Cbegbuf)
901 bufp->anchor = 2; /* begbuf */
902 else
903 bufp->anchor = 0; /* none */
904 bufp->fastmap_accurate = 1;
905}
906
907#define INITIAL_FAILURES 128 /* initial # failure points to allocate */
908#define MAX_FAILURES 4100 /* max # of failure points before failing */
909
910int re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop)
911regexp_t bufp;
912char *string1, *string2;
913int size1, size2, pos, mstop;
914regexp_registers_t regs;
915{
916 struct failure_point { char *text, *partend, *code; }
917 *failure_stack_start, *failure_sp, *failure_stack_end,
918 initial_failure_stack[INITIAL_FAILURES];
919 char *code, *translate, *text, *textend, *partend, *part_2_end;
920 char *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
921 char *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
922 int a, b, ch, reg, regch, match_end;
923 char *regtext, *regpartend, *regtextend;
924
925#define PREFETCH \
926 MACRO_BEGIN \
927 if (text == partend) \
928 { \
929 if (text == textend) \
930 goto fail; \
931 text = string2; \
932 partend = part_2_end; \
933 } \
934 MACRO_END
935
936#define NEXTCHAR(var) \
937 MACRO_BEGIN \
938 PREFETCH; \
939 (var) = (unsigned char)*text++; \
940 if (translate) \
941 (var) = (unsigned char)translate[(var)]; \
942 MACRO_END
943
944 assert(pos >= 0 && size1 >= 0 && size2 >= 0 && mstop >= 0);
945 assert(mstop <= size1 + size2);
946 assert(pos <= mstop);
947
948 if (pos <= size1)
949 {
950 text = string1 + pos;
951 if (mstop <= size1)
952 {
953 partend = string1 + mstop;
954 textend = partend;
955 }
956 else
957 {
958 partend = string1 + size1;
959 textend = string2 + mstop - size1;
960 }
961 part_2_end = string2 + mstop - size1;
962 }
963 else
964 {
965 text = string2 + pos - size1;
966 partend = string2 + mstop - size1;
967 textend = partend;
968 part_2_end = partend;
969 }
970
971 if (bufp->uses_registers && regs != NULL)
972 for (a = 0; a < RE_NREGS; a++)
973 regend_text[a] = NULL;
974
975 code = bufp->buffer;
976 translate = bufp->translate;
977 failure_stack_start = failure_sp = initial_failure_stack;
978 failure_stack_end = initial_failure_stack + INITIAL_FAILURES;
979
980#if 0
981 /* re_search_2 has already done this, and otherwise we get little benefit
982 from this. So I'll leave this out. */
983 if (bufp->fastmap_accurate && !bufp->can_be_null &&
984 text != textend &&
985 !bufp->fastmap[translate ?
986 (unsigned char)translate[(unsigned char)*text] :
987 (unsigned char)*text])
988 return -1; /* it can't possibly match */
989#endif
990
991 continue_matching:
992 for (;;)
993 {
994 switch (*code++)
995 {
996 case Cend:
997 if (partend != part_2_end)
998 match_end = text - string1;
999 else
1000 match_end = text - string2 + size1;
1001 if (regs)
1002 {
1003 regs->start[0] = pos;
1004 regs->end[0] = match_end;
1005 if (!bufp->uses_registers)
1006 {
1007 for (a = 1; a < RE_NREGS; a++)
1008 {
1009 regs->start[a] = -1;
1010 regs->end[a] = -1;
1011 }
1012 }
1013 else
1014 {
1015 for (a = 1; a < RE_NREGS; a++)
1016 {
1017 if (regend_text[a] == NULL)
1018 {
1019 regs->start[a] = -1;
1020 regs->end[a] = -1;
1021 continue;
1022 }
1023 if (regstart_partend[a] != part_2_end)
1024 regs->start[a] = regstart_text[a] - string1;
1025 else
1026 regs->start[a] = regstart_text[a] - string2 + size1;
1027 if (regend_partend[a] != part_2_end)
1028 regs->end[a] = regend_text[a] - string1;
1029 else
1030 regs->end[a] = regend_text[a] - string2 + size1;
1031 }
1032 }
1033 }
1034 if (failure_stack_start != initial_failure_stack)
1035 free((char *)failure_stack_start);
1036 return match_end - pos;
1037 case Cbol:
1038 if (text == string1 || text[-1] == '\n') /* text[-1] always valid */
1039 break;
1040 goto fail;
1041 case Ceol:
1042 if (text == string2 + size2 ||
1043 (text == string1 + size1 ?
1044 (size2 == 0 || *string2 == '\n') :
1045 *text == '\n'))
1046 break;
1047 goto fail;
1048 case Cset:
1049 NEXTCHAR(ch);
1050 if (code[ch/8] & (1<<(ch & 7)))
1051 {
1052 code += 256/8;
1053 break;
1054 }
1055 goto fail;
1056 case Cexact:
1057 NEXTCHAR(ch);
1058 if (ch != (unsigned char)*code++)
1059 goto fail;
1060 break;
1061 case Canychar:
1062 NEXTCHAR(ch);
1063 if (ch == '\n')
1064 goto fail;
1065 break;
1066 case Cstart_memory:
1067 reg = *code++;
1068 regstart_text[reg] = text;
1069 regstart_partend[reg] = partend;
1070 break;
1071 case Cend_memory:
1072 reg = *code++;
1073 regend_text[reg] = text;
1074 regend_partend[reg] = partend;
1075 break;
1076 case Cmatch_memory:
1077 reg = *code++;
1078 if (regend_text[reg] == NULL)
1079 goto fail; /* or should we just match nothing? */
1080 regtext = regstart_text[reg];
1081 regtextend = regend_text[reg];
1082 if (regstart_partend[reg] == regend_partend[reg])
1083 regpartend = regtextend;
1084 else
1085 regpartend = string1 + size1;
1086
1087 for (;regtext != regtextend;)
1088 {
1089 NEXTCHAR(ch);
1090 if (regtext == regpartend)
1091 regtext = string2;
1092 regch = (unsigned char)*regtext++;
1093 if (translate)
1094 regch = (unsigned char)translate[regch];
1095 if (regch != ch)
1096 goto fail;
1097 }
1098 break;
1099 case Cstar_jump:
1100 /* star is coded as:
1101 1: failure_jump 2
1102 ... code for operand of star
1103 star_jump 1
1104 2: ... code after star
1105 We change the star_jump to update_failure_jump if we can determine
1106 that it is safe to do so; otherwise we change it to an ordinary
1107 jump.
1108 plus is coded as
1109 jump 2
1110 1: failure_jump 3
1111 2: ... code for operand of plus
1112 star_jump 1
1113 3: ... code after plus
1114 For star_jump considerations this is processed identically
1115 to star. */
1116 a = (unsigned char)*code++;
1117 a |= (unsigned char)*code++ << 8;
1118 a = (int)(short)a;
1119 {
1120 char map[256], can_be_null;
1121 char *p1, *p2;
1122
1123 p1 = code + a + 3; /* skip the failure_jump */
1124 assert(p1[-3] == Cfailure_jump);
1125 p2 = code;
1126 /* p1 points inside loop, p2 points to after loop */
1127 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
1128 p2 - bufp->buffer, &can_be_null, map))
1129 goto make_normal_jump;
1130 /* If we might introduce a new update point inside the loop,
1131 we can't optimize because then update_jump would update a
1132 wrong failure point. Thus we have to be quite careful here. */
1133 loop_p1:
1134 /* loop until we find something that consumes a character */
1135 switch (*p1++)
1136 {
1137 case Cbol:
1138 case Ceol:
1139 case Cbegbuf:
1140 case Cendbuf:
1141 case Cwordbeg:
1142 case Cwordend:
1143 case Cwordbound:
1144 case Cnotwordbound:
1145#ifdef emacs
1146 case Cemacs_at_dot:
1147#endif /* emacs */
1148 goto loop_p1;
1149 case Cstart_memory:
1150 case Cend_memory:
1151 p1++;
1152 goto loop_p1;
1153 case Cexact:
1154 ch = (unsigned char)*p1++;
1155 if (map[ch])
1156 goto make_normal_jump;
1157 break;
1158 case Canychar:
1159 for (b = 0; b < 256; b++)
1160 if (b != '\n' && map[b])
1161 goto make_normal_jump;
1162 break;
1163 case Cset:
1164 for (b = 0; b < 256; b++)
1165 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
1166 goto make_normal_jump;
1167 p1 += 256/8;
1168 break;
1169 default:
1170 goto make_normal_jump;
1171 }
1172 /* now we know that we can't backtrack. */
1173 while (p1 != p2 - 3)
1174 {
1175 switch (*p1++)
1176 {
1177 case Cend:
1178 abort(); /* we certainly shouldn't get this inside loop */
1179 /*NOTREACHED*/
1180 case Cbol:
1181 case Ceol:
1182 case Canychar:
1183 case Cbegbuf:
1184 case Cendbuf:
1185 case Cwordbeg:
1186 case Cwordend:
1187 case Cwordbound:
1188 case Cnotwordbound:
1189#ifdef emacs
1190 case Cemacs_at_dot:
1191#endif /* emacs */
1192 break;
1193 case Cset:
1194 p1 += 256/8;
1195 break;
1196 case Cexact:
1197 case Cstart_memory:
1198 case Cend_memory:
1199 case Cmatch_memory:
1200 case Csyntaxspec:
1201 case Cnotsyntaxspec:
1202 p1++;
1203 break;
1204 case Cjump:
1205 case Cstar_jump:
1206 case Cfailure_jump:
1207 case Cupdate_failure_jump:
1208 case Cdummy_failure_jump:
1209 goto make_normal_jump;
1210 default:
1211 printf("regexpr.c: processing star_jump: unknown op %d\n", p1[-1]);
1212 break;
1213 }
1214 }
1215 goto make_update_jump;
1216 }
1217 make_normal_jump:
1218 /* printf("changing to normal jump\n"); */
1219 code -= 3;
1220 *code = Cjump;
1221 break;
1222 make_update_jump:
1223 /* printf("changing to update jump\n"); */
1224 code -= 2;
1225 a += 3; /* jump to after the Cfailure_jump */
1226 code[-1] = Cupdate_failure_jump;
1227 code[0] = a & 0xff;
1228 code[1] = a >> 8;
1229 /* fall to next case */
1230 case Cupdate_failure_jump:
1231 failure_sp[-1].text = text;
1232 failure_sp[-1].partend = partend;
1233 /* fall to next case */
1234 case Cjump:
1235 a = (unsigned char)*code++;
1236 a |= (unsigned char)*code++ << 8;
1237 code += (int)(short)a;
1238 break;
1239 case Cdummy_failure_jump:
1240 case Cfailure_jump:
1241 if (failure_sp == failure_stack_end)
1242 {
1243 if (failure_stack_start != initial_failure_stack)
1244 goto error;
1245 failure_stack_start = (struct failure_point *)
1246 malloc(MAX_FAILURES * sizeof(*failure_stack_start));
1247 failure_stack_end = failure_stack_start + MAX_FAILURES;
1248 memcpy((char *)failure_stack_start, (char *)initial_failure_stack,
1249 INITIAL_FAILURES * sizeof(*failure_stack_start));
1250 failure_sp = failure_stack_start + INITIAL_FAILURES;
1251 }
1252 a = (unsigned char)*code++;
1253 a |= (unsigned char)*code++ << 8;
1254 a = (int)(short)a;
1255 if (code[-3] == Cdummy_failure_jump)
1256 { /* this is only used in plus */
1257 assert(*code == Cfailure_jump);
1258 b = (unsigned char)code[1];
1259 b |= (unsigned char)code[2] << 8;
1260 failure_sp->code = code + (int)(short)b + 3;
1261 failure_sp->text = NULL;
1262 code += a;
1263 }
1264 else
1265 {
1266 failure_sp->code = code + a;
1267 failure_sp->text = text;
1268 failure_sp->partend = partend;
1269 }
1270 failure_sp++;
1271 break;
1272 case Cbegbuf:
1273 if (text == string1)
1274 break;
1275 goto fail;
1276 case Cendbuf:
1277 if (size2 == 0 ? text == string1 + size1 : text == string2 + size2)
1278 break;
1279 goto fail;
1280 case Cwordbeg:
1281 if (text == string2 + size2)
1282 goto fail;
1283 if (size2 == 0 && text == string1 + size1)
1284 goto fail;
1285 if (SYNTAX(text == string1 + size1 ? *string1 : *text) != Sword)
1286 goto fail;
1287 if (text == string1)
1288 break;
1289 if (SYNTAX(text[-1]) != Sword)
1290 break;
1291 goto fail;
1292 case Cwordend:
1293 if (text == string1)
1294 goto fail;
1295 if (SYNTAX(text[-1]) != Sword)
1296 goto fail;
1297 if (text == string2 + size2)
1298 break;
1299 if (size2 == 0 && text == string1 + size1)
1300 break;
1301 if (SYNTAX(*text) == Sword)
1302 goto fail;
1303 break;
1304 case Cwordbound:
1305 /* Note: as in gnu regexp, this also matches at the beginning
1306 and end of buffer. */
1307 if (text == string1 || text == string2 + size2 ||
1308 (size2 == 0 && text == string1 + size1))
1309 break;
1310 if ((SYNTAX(text[-1]) == Sword) ^
1311 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword))
1312 break;
1313 goto fail;
1314 case Cnotwordbound:
1315 /* Note: as in gnu regexp, this never matches at the beginning
1316 and end of buffer. */
1317 if (text == string1 || text == string2 + size2 ||
1318 (size2 == 0 && text == string1 + size1))
1319 goto fail;
1320 if (!((SYNTAX(text[-1]) == Sword) ^
1321 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)))
1322 goto fail;
1323 break;
1324 case Csyntaxspec:
1325 NEXTCHAR(ch);
1326 if (SYNTAX(ch) != (unsigned char)*code++)
1327 goto fail;
1328 break;
1329 case Cnotsyntaxspec:
1330 NEXTCHAR(ch);
1331 if (SYNTAX(ch) != (unsigned char)*code++)
1332 break;
1333 goto fail;
1334#ifdef emacs
1335 case Cemacs_at_dot:
1336 if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
1337 goto fail;
1338 break;
1339#endif /* emacs */
1340 default:
1341 abort();
1342 /*NOTREACHED*/
1343 }
1344 }
Guido van Rossum3b1a57a1992-01-27 16:47:46 +00001345#if 0 /* This line is never reached --Guido */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001346 abort();
Guido van Rossum5f21dd11992-01-19 16:49:14 +00001347#endif
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001348 /*NOTREACHED*/
1349
1350 fail:
1351 if (failure_sp != failure_stack_start)
1352 {
1353 failure_sp--;
1354 text = failure_sp->text;
1355 if (text == NULL)
1356 goto fail;
1357 partend = failure_sp->partend;
1358 code = failure_sp->code;
1359 goto continue_matching;
1360 }
1361 if (failure_stack_start != initial_failure_stack)
1362 free((char *)failure_stack_start);
1363 return -1;
1364
1365 error:
1366 if (failure_stack_start != initial_failure_stack)
1367 free((char *)failure_stack_start);
1368 return -2;
1369}
1370
1371#undef PREFETCH
1372#undef NEXTCHAR
1373#undef PUSH_FAILURE
1374
1375int re_match(bufp, string, size, pos, regs)
1376regexp_t bufp;
1377char *string;
1378int size, pos;
1379regexp_registers_t regs;
1380{
1381 return re_match_2(bufp, string, size, (char *)NULL, 0, pos, regs, size);
1382}
1383
1384int re_search_2(bufp, string1, size1, string2, size2, pos, range, regs,
1385 mstop)
1386regexp_t bufp;
1387char *string1, *string2;
1388int size1, size2, pos, range, mstop;
1389regexp_registers_t regs;
1390{
1391 char *fastmap, *translate, *text, *partstart, *partend;
1392 int dir, ret;
1393 char anchor;
1394
1395 assert(size1 >= 0 && size2 >= 0 && pos >= 0 && mstop >= 0);
Guido van Rossum521f81c1992-01-26 18:23:48 +00001396 assert(pos + range >= 0 && pos + range <= size1 + size2); /* Bugfix by ylo */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001397 assert(pos <= mstop);
1398
1399 fastmap = bufp->fastmap;
1400 translate = bufp->translate;
1401 if (fastmap && !bufp->fastmap_accurate)
1402 re_compile_fastmap(bufp);
1403 anchor = bufp->anchor;
1404 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1405 fastmap = NULL;
1406 if (range < 0)
1407 {
1408 dir = -1;
1409 range = -range;
1410 }
1411 else
1412 dir = 1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001413 if (anchor == 2)
1414 if (pos != 0)
1415 return -1;
1416 else
1417 range = 0;
1418 for (; range >= 0; range--, pos += dir)
1419 {
1420 if (fastmap)
1421 {
1422 if (dir == 1)
1423 { /* searching forwards */
1424 if (pos < size1)
1425 {
1426 text = string1 + pos;
1427 if (pos + range > size1)
1428 partend = string1 + size1;
1429 else
1430 partend = string1 + pos + range;
1431 }
1432 else
1433 {
1434 text = string2 + pos - size1;
1435 partend = string2 + pos + range - size1;
1436 }
1437 partstart = text;
1438 if (translate)
1439 while (text != partend &&
1440 !fastmap[(unsigned char)
1441 translate[(unsigned char)*text]])
1442 text++;
1443 else
1444 while (text != partend && !fastmap[(unsigned char)*text])
1445 text++;
1446 pos += text - partstart;
1447 range -= text - partstart;
1448 if (pos == size1 + size2 && bufp->can_be_null == 0)
1449 return -1;
1450 }
1451 else
1452 { /* searching backwards */
1453 if (pos <= size1)
1454 {
1455 text = string1 + pos;
1456 partstart = string1 + pos - range;
1457 }
1458 else
1459 {
1460 text = string2 + pos - size1;
1461 if (range < pos - size1)
1462 partstart = string2 + pos - size1 - range;
1463 else
1464 partstart = string2;
1465 }
1466 partend = text;
1467 if (translate)
1468 while (text != partstart &&
1469 !fastmap[(unsigned char)
1470 translate[(unsigned char)*text]])
1471 text--;
1472 else
1473 while (text != partstart &&
1474 !fastmap[(unsigned char)*text])
1475 text--;
1476 pos -= partend - text;
1477 range -= partend - text;
1478 }
1479 }
1480 if (anchor == 1)
1481 { /* anchored to begline */
1482 if (pos > 0 &&
1483 (pos <= size1 ? string1[pos - 1] :
1484 string2[pos - size1 - 1]) != '\n')
1485 continue;
1486 }
1487 assert(pos >= 0 && pos <= size1 + size2);
1488 ret = re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop);
1489 if (ret >= 0)
1490 return pos;
1491 if (ret == -2)
1492 return -2;
1493 }
1494 return -1;
1495}
1496
1497int re_search(bufp, string, size, startpos, range, regs)
1498regexp_t bufp;
1499char *string;
1500int size, startpos, range;
1501regexp_registers_t regs;
1502{
1503 return re_search_2(bufp, string, size, (char *)NULL, 0,
1504 startpos, range, regs, size);
1505}
1506
1507static struct re_pattern_buffer re_comp_buf;
1508
1509char *re_comp(s)
1510char *s;
1511{
1512 if (s == NULL)
1513 {
1514 if (!re_comp_buf.buffer)
1515 return "Out of memory";
1516 return NULL;
1517 }
1518 if (!re_comp_buf.buffer)
1519 {
1520 /* the buffer will be allocated automatically */
1521 re_comp_buf.fastmap = malloc(256);
1522 re_comp_buf.translate = NULL;
1523 }
1524 return re_compile_pattern(s, strlen(s), &re_comp_buf);
1525}
1526
1527int re_exec(s)
1528char *s;
1529{
1530 int len = strlen(s);
1531
1532 return re_search(&re_comp_buf, s, len, 0, len, (regexp_registers_t)NULL) >= 0;
1533}
1534
1535#ifdef TEST_REGEXP
1536
1537int main()
1538{
1539 char buf[500];
1540 char *cp;
1541 struct re_pattern_buffer exp;
1542 struct re_registers regs;
1543 int a,pos;
1544 char fastmap[256];
1545
1546 exp.allocated = 0;
1547 exp.buffer = 0;
1548 exp.translate = NULL;
1549 exp.fastmap = fastmap;
1550
1551 /* re_set_syntax(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_ANSI_HEX); */
1552
1553 while (1)
1554 {
1555 printf("Enter regexp:\n");
1556 gets(buf);
1557 cp=re_compile_pattern(buf, strlen(buf), &exp);
1558 if (cp)
1559 {
1560 printf("Error: %s\n", cp);
1561 continue;
1562 }
1563 re_compile_fastmap(&exp);
1564 printf("dump:\n");
1565 for (pos = 0; pos < exp.used;)
1566 {
1567 printf("%d: ", pos);
1568 switch (exp.buffer[pos++])
1569 {
1570 case Cend:
1571 strcpy(buf, "end");
1572 break;
1573 case Cbol:
1574 strcpy(buf, "bol");
1575 break;
1576 case Ceol:
1577 strcpy(buf, "eol");
1578 break;
1579 case Cset:
1580 strcpy(buf, "set ");
1581 for (a = 0; a < 256/8; a++)
1582 sprintf(buf+strlen(buf)," %02x",
1583 (unsigned char)exp.buffer[pos++]);
1584 break;
1585 case Cexact:
1586 sprintf(buf, "exact '%c' 0x%x", exp.buffer[pos],
1587 (unsigned char)exp.buffer[pos]);
1588 pos++;
1589 break;
1590 case Canychar:
1591 strcpy(buf, "anychar");
1592 break;
1593 case Cstart_memory:
1594 sprintf(buf, "start_memory %d", exp.buffer[pos++]);
1595 break;
1596 case Cend_memory:
1597 sprintf(buf, "end_memory %d", exp.buffer[pos++]);
1598 break;
1599 case Cmatch_memory:
1600 sprintf(buf, "match_memory %d", exp.buffer[pos++]);
1601 break;
1602 case Cjump:
1603 case Cdummy_failure_jump:
1604 case Cstar_jump:
1605 case Cfailure_jump:
1606 case Cupdate_failure_jump:
1607 a = (unsigned char)exp.buffer[pos++];
1608 a += (unsigned char)exp.buffer[pos++] << 8;
1609 a = (int)(short)a;
1610 switch (exp.buffer[pos-3])
1611 {
1612 case Cjump:
1613 cp = "jump";
1614 break;
1615 case Cstar_jump:
1616 cp = "star_jump";
1617 break;
1618 case Cfailure_jump:
1619 cp = "failure_jump";
1620 break;
1621 case Cupdate_failure_jump:
1622 cp = "update_failure_jump";
1623 break;
1624 case Cdummy_failure_jump:
1625 cp = "dummy_failure_jump";
1626 break;
1627 default:
1628 cp = "unknown jump";
1629 break;
1630 }
1631 sprintf(buf, "%s %d", cp, a + pos);
1632 break;
1633 case Cbegbuf:
1634 strcpy(buf,"begbuf");
1635 break;
1636 case Cendbuf:
1637 strcpy(buf,"endbuf");
1638 break;
1639 case Cwordbeg:
1640 strcpy(buf,"wordbeg");
1641 break;
1642 case Cwordend:
1643 strcpy(buf,"wordend");
1644 break;
1645 case Cwordbound:
1646 strcpy(buf,"wordbound");
1647 break;
1648 case Cnotwordbound:
1649 strcpy(buf,"notwordbound");
1650 break;
1651 default:
1652 sprintf(buf, "unknown code %d",
1653 (unsigned char)exp.buffer[pos - 1]);
1654 break;
1655 }
1656 printf("%s\n", buf);
1657 }
1658 printf("can_be_null = %d uses_registers = %d anchor = %d\n",
1659 exp.can_be_null, exp.uses_registers, exp.anchor);
1660
1661 printf("fastmap:");
1662 for (a = 0; a < 256; a++)
1663 if (exp.fastmap[a])
1664 printf(" %d", a);
1665 printf("\n");
1666 printf("Enter strings. An empty line terminates.\n");
1667 while (fgets(buf, sizeof(buf), stdin))
1668 {
1669 if (buf[0] == '\n')
1670 break;
1671 a = re_search(&exp, buf, strlen(buf), 0, strlen(buf), &regs);
1672 printf("search returns %d\n", a);
1673 if (a != -1)
1674 {
1675 for (a = 0; a < RE_NREGS; a++)
1676 {
1677 printf("buf %d: %d to %d\n", a, regs.start[a], regs.end[a]);
1678 }
1679 }
1680 }
1681 }
1682}
1683
1684#endif /* TEST_REGEXP */