blob: 9f03e7c68d7dd8a04c131cbc1a91c5b9be813d69 [file] [log] [blame]
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001/*
2
3regexpr.c
4
5Author: Tatu Ylonen <ylo@ngs.fi>
6
7Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
8
9Permission to use, copy, modify, distribute, and sell this software
10and its documentation for any purpose is hereby granted without fee,
11provided that the above copyright notice appear in all copies. This
12software is provided "as is" without express or implied warranty.
13
14Created: Thu Sep 26 17:14:05 1991 ylo
15Last modified: Mon Nov 4 17:06:48 1991 ylo
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000016Ported to Think C: 19 Jan 1992 guido@cwi.nl
Guido van Rossumb674c3b1992-01-19 16:32:47 +000017
18This code draws many ideas from the regular expression packages by
19Henry Spencer of the University of Toronto and Richard Stallman of the
20Free Software Foundation.
21
22Emacs-specific code and syntax table code is almost directly borrowed
23from GNU regexp.
24
Guido van Rossumb674c3b1992-01-19 16:32:47 +000025*/
26
Guido van Rossum339cfa31996-08-08 19:12:37 +000027#include "config.h" /* For Win* specific redefinition of printf c.s. */
Guido van Rossum339cfa31996-08-08 19:12:37 +000028
Guido van Rossumb6775db1994-08-01 11:34:53 +000029#include "myproto.h" /* For PROTO macro --Guido */
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000030
Guido van Rossumb674c3b1992-01-19 16:32:47 +000031#include <stdio.h>
32#include <assert.h>
33#include "regexpr.h"
34
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000035#ifdef THINK_C
36/* Think C on the Mac really needs these headers... --Guido */
37#include <stdlib.h>
38#include <string.h>
39#else
Guido van Rossum88661e81996-05-23 22:55:58 +000040#if defined(__STDC__) || defined(_MSC_VER)
Guido van Rossum9abc5391992-03-27 17:24:37 +000041/* Don't mess around, use the standard headers */
42#include <stdlib.h>
43#include <string.h>
44#else
Guido van Rossumb674c3b1992-01-19 16:32:47 +000045char *malloc();
46void free();
47char *realloc();
Guido van Rossum9abc5391992-03-27 17:24:37 +000048#endif /* __STDC__ */
49#endif /* THINK_C */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000050
51#define MACRO_BEGIN do {
52#define MACRO_END } while (0)
53
54enum regexp_compiled_ops /* opcodes for compiled regexp */
55{
56 Cend, /* end of pattern reached */
57 Cbol, /* beginning of line */
58 Ceol, /* end of line */
59 Cset, /* character set. Followed by 32 bytes of set. */
60 Cexact, /* followed by a byte to match */
61 Canychar, /* matches any character except newline */
62 Cstart_memory, /* set register start addr (followed by reg number) */
63 Cend_memory, /* set register end addr (followed by reg number) */
64 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
65 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
66 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
67 Cfailure_jump, /* jump to addr on failure */
68 Cupdate_failure_jump, /* update topmost failure point and jump */
69 Cdummy_failure_jump, /* push a dummy failure point and jump */
70 Cbegbuf, /* match at beginning of buffer */
71 Cendbuf, /* match at end of buffer */
72 Cwordbeg, /* match at beginning of word */
73 Cwordend, /* match at end of word */
74 Cwordbound, /* match if at word boundary */
75 Cnotwordbound, /* match if not at word boundary */
76#ifdef emacs
77 Cemacs_at_dot, /* emacs only: matches at dot */
78#endif /* emacs */
79 Csyntaxspec, /* matches syntax code (1 byte follows) */
80 Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/
81};
82
83enum regexp_syntax_op /* syntax codes for plain and quoted characters */
84{
85 Rend, /* special code for end of regexp */
86 Rnormal, /* normal character */
87 Ranychar, /* any character except newline */
88 Rquote, /* the quote character */
89 Rbol, /* match beginning of line */
90 Reol, /* match end of line */
91 Roptional, /* match preceding expression optionally */
92 Rstar, /* match preceding expr zero or more times */
93 Rplus, /* match preceding expr one or more times */
94 Ror, /* match either of alternatives */
95 Ropenpar, /* opening parenthesis */
96 Rclosepar, /* closing parenthesis */
97 Rmemory, /* match memory register */
98 Rextended_memory, /* \vnn to match registers 10-99 */
99 Ropenset, /* open set. Internal syntax hard-coded below. */
100 /* the following are gnu extensions to "normal" regexp syntax */
101 Rbegbuf, /* beginning of buffer */
102 Rendbuf, /* end of buffer */
103 Rwordchar, /* word character */
104 Rnotwordchar, /* not word character */
105 Rwordbeg, /* beginning of word */
106 Rwordend, /* end of word */
107 Rwordbound, /* word bound */
108 Rnotwordbound, /* not word bound */
109#ifdef emacs
110 Remacs_at_dot, /* emacs: at dot */
111 Remacs_syntaxspec, /* syntaxspec */
112 Remacs_notsyntaxspec, /* notsyntaxspec */
113#endif /* emacs */
114 Rnum_ops
115};
116
117static int re_compile_initialized = 0;
118static int regexp_syntax = 0;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000119int re_syntax = 0; /* Exported copy of regexp_syntax */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000120static unsigned char regexp_plain_ops[256];
121static unsigned char regexp_quoted_ops[256];
122static unsigned char regexp_precedences[Rnum_ops];
123static int regexp_context_indep_ops;
124static int regexp_ansi_sequences;
125
126#define NUM_LEVELS 5 /* number of precedence levels in use */
127#define MAX_NESTING 100 /* max nesting level of operators */
128
129#ifdef emacs
130
131/* This code is for emacs compatibility only. */
132
133#include "config.h"
134#include "lisp.h"
135#include "buffer.h"
136#include "syntax.h"
137
138/* emacs defines NULL in some strange way? */
139#undef NULL
140#define NULL 0
141
142#else /* emacs */
143
144#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
145#define Sword 1
146
147#ifdef SYNTAX_TABLE
148char *re_syntax_table;
149#else
150static char re_syntax_table[256];
151#endif /* SYNTAX_TABLE */
152
153#endif /* emacs */
154
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000155static void re_compile_initialize PROTO((void));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000156static void re_compile_initialize()
157{
158 int a;
159
160#if !defined(emacs) && !defined(SYNTAX_TABLE)
161 static int syntax_table_inited = 0;
162
163 if (!syntax_table_inited)
164 {
165 syntax_table_inited = 1;
166 memset(re_syntax_table, 0, 256);
167 for (a = 'a'; a <= 'z'; a++)
168 re_syntax_table[a] = Sword;
169 for (a = 'A'; a <= 'Z'; a++)
170 re_syntax_table[a] = Sword;
171 for (a = '0'; a <= '9'; a++)
172 re_syntax_table[a] = Sword;
173 }
174#endif /* !emacs && !SYNTAX_TABLE */
175 re_compile_initialized = 1;
176 for (a = 0; a < 256; a++)
177 {
178 regexp_plain_ops[a] = Rnormal;
179 regexp_quoted_ops[a] = Rnormal;
180 }
181 for (a = '0'; a <= '9'; a++)
182 regexp_quoted_ops[a] = Rmemory;
183 regexp_plain_ops['\134'] = Rquote;
184 if (regexp_syntax & RE_NO_BK_PARENS)
185 {
186 regexp_plain_ops['('] = Ropenpar;
187 regexp_plain_ops[')'] = Rclosepar;
188 }
189 else
190 {
191 regexp_quoted_ops['('] = Ropenpar;
192 regexp_quoted_ops[')'] = Rclosepar;
193 }
194 if (regexp_syntax & RE_NO_BK_VBAR)
195 regexp_plain_ops['\174'] = Ror;
196 else
197 regexp_quoted_ops['\174'] = Ror;
198 regexp_plain_ops['*'] = Rstar;
199 if (regexp_syntax & RE_BK_PLUS_QM)
200 {
201 regexp_quoted_ops['+'] = Rplus;
202 regexp_quoted_ops['?'] = Roptional;
203 }
204 else
205 {
206 regexp_plain_ops['+'] = Rplus;
207 regexp_plain_ops['?'] = Roptional;
208 }
209 if (regexp_syntax & RE_NEWLINE_OR)
210 regexp_plain_ops['\n'] = Ror;
211 regexp_plain_ops['\133'] = Ropenset;
212 regexp_plain_ops['\136'] = Rbol;
213 regexp_plain_ops['$'] = Reol;
214 regexp_plain_ops['.'] = Ranychar;
215 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
216 {
217#ifdef emacs
218 regexp_quoted_ops['='] = Remacs_at_dot;
219 regexp_quoted_ops['s'] = Remacs_syntaxspec;
220 regexp_quoted_ops['S'] = Remacs_notsyntaxspec;
221#endif /* emacs */
222 regexp_quoted_ops['w'] = Rwordchar;
223 regexp_quoted_ops['W'] = Rnotwordchar;
224 regexp_quoted_ops['<'] = Rwordbeg;
225 regexp_quoted_ops['>'] = Rwordend;
226 regexp_quoted_ops['b'] = Rwordbound;
227 regexp_quoted_ops['B'] = Rnotwordbound;
228 regexp_quoted_ops['`'] = Rbegbuf;
229 regexp_quoted_ops['\''] = Rendbuf;
230 }
231 if (regexp_syntax & RE_ANSI_HEX)
232 regexp_quoted_ops['v'] = Rextended_memory;
233 for (a = 0; a < Rnum_ops; a++)
234 regexp_precedences[a] = 4;
235 if (regexp_syntax & RE_TIGHT_VBAR)
236 {
237 regexp_precedences[Ror] = 3;
238 regexp_precedences[Rbol] = 2;
239 regexp_precedences[Reol] = 2;
240 }
241 else
242 {
243 regexp_precedences[Ror] = 2;
244 regexp_precedences[Rbol] = 3;
245 regexp_precedences[Reol] = 3;
246 }
247 regexp_precedences[Rclosepar] = 1;
248 regexp_precedences[Rend] = 0;
249 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
250 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
251}
252
253int re_set_syntax(syntax)
254int syntax;
255{
256 int ret;
257
258 ret = regexp_syntax;
259 regexp_syntax = syntax;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000260 re_syntax = syntax; /* Exported copy */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000261 re_compile_initialize();
262 return ret;
263}
264
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000265static int hex_char_to_decimal PROTO((int));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000266static int hex_char_to_decimal(ch)
267int ch;
268{
269 if (ch >= '0' && ch <= '9')
270 return ch - '0';
271 if (ch >= 'a' && ch <= 'f')
272 return ch - 'a' + 10;
273 if (ch >= 'A' && ch <= 'F')
274 return ch - 'A' + 10;
275 return 16;
276}
277
278char *re_compile_pattern(regex, size, bufp)
279char *regex;
280int size;
281regexp_t bufp;
282{
283 int a, pos, op, current_level, level, opcode;
Guido van Rossuma376cc51996-12-05 23:43:35 +0000284 int pattern_offset = 0, alloc;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000285 int starts[NUM_LEVELS * MAX_NESTING], starts_base;
286 int future_jumps[MAX_NESTING], num_jumps;
Guido van Rossuma376cc51996-12-05 23:43:35 +0000287 unsigned char ch = '\0';
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000288 char *pattern, *translate;
289 int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS];
290 int beginning_context;
291
292#define NEXTCHAR(var) \
293 MACRO_BEGIN \
294 if (pos >= size) \
295 goto ends_prematurely; \
296 (var) = regex[pos]; \
297 pos++; \
298 MACRO_END
299
300#define ALLOC(amount) \
301 MACRO_BEGIN \
302 if (pattern_offset+(amount) > alloc) \
303 { \
304 alloc += 256 + (amount); \
305 pattern = realloc(pattern, alloc); \
306 if (!pattern) \
307 goto out_of_memory; \
308 } \
309 MACRO_END
310
311#define STORE(ch) pattern[pattern_offset++] = (ch)
312
313#define CURRENT_LEVEL_START (starts[starts_base + current_level])
314
315#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
316
317#define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
318 starts_base += NUM_LEVELS; \
319 else \
320 goto too_complex
321
322#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
323
324#define PUT_ADDR(offset,addr) \
325 MACRO_BEGIN \
326 int disp = (addr) - (offset) - 2; \
327 pattern[(offset)] = disp & 0xff; \
328 pattern[(offset)+1] = (disp>>8) & 0xff; \
329 MACRO_END
330
331#define INSERT_JUMP(pos,type,addr) \
332 MACRO_BEGIN \
333 int a, p = (pos), t = (type), ad = (addr); \
334 for (a = pattern_offset - 1; a >= p; a--) \
335 pattern[a + 3] = pattern[a]; \
336 pattern[p] = t; \
337 PUT_ADDR(p+1,ad); \
338 pattern_offset += 3; \
339 MACRO_END
340
341#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
342
343#define SET_FIELDS \
344 MACRO_BEGIN \
345 bufp->allocated = alloc; \
346 bufp->buffer = pattern; \
347 bufp->used = pattern_offset; \
348 MACRO_END
349
350#define GETHEX(var) \
351 MACRO_BEGIN \
352 char gethex_ch, gethex_value; \
353 NEXTCHAR(gethex_ch); \
354 gethex_value = hex_char_to_decimal(gethex_ch); \
355 if (gethex_value == 16) \
356 goto hex_error; \
357 NEXTCHAR(gethex_ch); \
358 gethex_ch = hex_char_to_decimal(gethex_ch); \
359 if (gethex_ch == 16) \
360 goto hex_error; \
361 (var) = gethex_value * 16 + gethex_ch; \
362 MACRO_END
363
364#define ANSI_TRANSLATE(ch) \
365 MACRO_BEGIN \
366 switch (ch) \
367 { \
368 case 'a': \
369 case 'A': \
370 ch = 7; /* audible bell */ \
371 break; \
372 case 'b': \
373 case 'B': \
374 ch = 8; /* backspace */ \
375 break; \
376 case 'f': \
377 case 'F': \
378 ch = 12; /* form feed */ \
379 break; \
380 case 'n': \
381 case 'N': \
382 ch = 10; /* line feed */ \
383 break; \
384 case 'r': \
385 case 'R': \
386 ch = 13; /* carriage return */ \
387 break; \
388 case 't': \
389 case 'T': \
390 ch = 9; /* tab */ \
391 break; \
392 case 'v': \
393 case 'V': \
394 ch = 11; /* vertical tab */ \
395 break; \
396 case 'x': /* hex code */ \
397 case 'X': \
398 GETHEX(ch); \
399 break; \
400 default: \
401 /* other characters passed through */ \
402 if (translate) \
403 ch = translate[(unsigned char)ch]; \
404 break; \
405 } \
406 MACRO_END
407
408 if (!re_compile_initialized)
409 re_compile_initialize();
410 bufp->used = 0;
411 bufp->fastmap_accurate = 0;
412 bufp->uses_registers = 0;
413 translate = bufp->translate;
414 pattern = bufp->buffer;
415 alloc = bufp->allocated;
416 if (alloc == 0 || pattern == NULL)
417 {
418 alloc = 256;
419 pattern = malloc(alloc);
420 if (!pattern)
421 goto out_of_memory;
422 }
423 pattern_offset = 0;
424 starts_base = 0;
425 num_jumps = 0;
426 current_level = 0;
427 SET_LEVEL_START;
428 num_open_registers = 0;
429 next_register = 1;
430 paren_depth = 0;
431 beginning_context = 1;
432 op = -1;
433 /* we use Rend dummy to ensure that pending jumps are updated (due to
434 low priority of Rend) before exiting the loop. */
435 pos = 0;
436 while (op != Rend)
437 {
438 if (pos >= size)
439 op = Rend;
440 else
441 {
442 NEXTCHAR(ch);
443 if (translate)
444 ch = translate[(unsigned char)ch];
445 op = regexp_plain_ops[(unsigned char)ch];
446 if (op == Rquote)
447 {
448 NEXTCHAR(ch);
449 op = regexp_quoted_ops[(unsigned char)ch];
450 if (op == Rnormal && regexp_ansi_sequences)
451 ANSI_TRANSLATE(ch);
452 }
453 }
454 level = regexp_precedences[op];
455 /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
456 ch, op, level, current_level, CURRENT_LEVEL_START); */
457 if (level > current_level)
458 {
459 for (current_level++; current_level < level; current_level++)
460 SET_LEVEL_START;
461 SET_LEVEL_START;
462 }
463 else
464 if (level < current_level)
465 {
466 current_level = level;
467 for (;num_jumps > 0 &&
468 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
469 num_jumps--)
470 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
471 }
472 switch (op)
473 {
474 case Rend:
475 break;
476 case Rnormal:
477 normal_char:
478 opcode = Cexact;
479 store_opcode_and_arg: /* opcode & ch must be set */
480 SET_LEVEL_START;
481 ALLOC(2);
482 STORE(opcode);
483 STORE(ch);
484 break;
485 case Ranychar:
486 opcode = Canychar;
487 store_opcode:
488 SET_LEVEL_START;
489 ALLOC(1);
490 STORE(opcode);
491 break;
492 case Rquote:
493 abort();
494 /*NOTREACHED*/
495 case Rbol:
496 if (!beginning_context)
497 if (regexp_context_indep_ops)
498 goto op_error;
499 else
500 goto normal_char;
501 opcode = Cbol;
502 goto store_opcode;
503 case Reol:
504 if (!((pos >= size) ||
505 ((regexp_syntax & RE_NO_BK_VBAR) ?
506 (regex[pos] == '\174') :
507 (pos+1 < size && regex[pos] == '\134' &&
508 regex[pos+1] == '\174')) ||
509 ((regexp_syntax & RE_NO_BK_PARENS)?
510 (regex[pos] == ')'):
511 (pos+1 < size && regex[pos] == '\134' &&
512 regex[pos+1] == ')'))))
513 if (regexp_context_indep_ops)
514 goto op_error;
515 else
516 goto normal_char;
517 opcode = Ceol;
518 goto store_opcode;
Guido van Rossum9abc5391992-03-27 17:24:37 +0000519 /* NOTREACHED */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000520 break;
521 case Roptional:
522 if (beginning_context)
523 if (regexp_context_indep_ops)
524 goto op_error;
525 else
526 goto normal_char;
527 if (CURRENT_LEVEL_START == pattern_offset)
528 break; /* ignore empty patterns for ? */
529 ALLOC(3);
530 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
531 pattern_offset + 3);
532 break;
533 case Rstar:
534 case Rplus:
535 if (beginning_context)
536 if (regexp_context_indep_ops)
537 goto op_error;
538 else
539 goto normal_char;
540 if (CURRENT_LEVEL_START == pattern_offset)
541 break; /* ignore empty patterns for + and * */
542 ALLOC(9);
543 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
544 pattern_offset + 6);
545 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
546 if (op == Rplus) /* jump over initial failure_jump */
547 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
548 CURRENT_LEVEL_START + 6);
549 break;
550 case Ror:
551 ALLOC(6);
552 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
553 pattern_offset + 6);
554 if (num_jumps >= MAX_NESTING)
555 goto too_complex;
556 STORE(Cjump);
557 future_jumps[num_jumps++] = pattern_offset;
558 STORE(0);
559 STORE(0);
560 SET_LEVEL_START;
561 break;
562 case Ropenpar:
563 SET_LEVEL_START;
564 if (next_register < RE_NREGS)
565 {
566 bufp->uses_registers = 1;
567 ALLOC(2);
568 STORE(Cstart_memory);
569 STORE(next_register);
570 open_registers[num_open_registers++] = next_register;
571 next_register++;
572 }
573 paren_depth++;
574 PUSH_LEVEL_STARTS;
575 current_level = 0;
576 SET_LEVEL_START;
577 break;
578 case Rclosepar:
579 if (paren_depth <= 0)
580 goto parenthesis_error;
581 POP_LEVEL_STARTS;
582 current_level = regexp_precedences[Ropenpar];
583 paren_depth--;
584 if (paren_depth < num_open_registers)
585 {
586 bufp->uses_registers = 1;
587 ALLOC(2);
588 STORE(Cend_memory);
589 num_open_registers--;
590 STORE(open_registers[num_open_registers]);
591 }
592 break;
593 case Rmemory:
594 if (ch == '0')
595 goto bad_match_register;
596 assert(ch >= '0' && ch <= '9');
597 bufp->uses_registers = 1;
598 opcode = Cmatch_memory;
599 ch -= '0';
600 goto store_opcode_and_arg;
601 case Rextended_memory:
602 NEXTCHAR(ch);
603 if (ch < '0' || ch > '9')
604 goto bad_match_register;
605 NEXTCHAR(a);
606 if (a < '0' || a > '9')
607 goto bad_match_register;
608 ch = 10 * (a - '0') + ch - '0';
609 if (ch <= 0 || ch >= RE_NREGS)
610 goto bad_match_register;
611 bufp->uses_registers = 1;
612 opcode = Cmatch_memory;
613 goto store_opcode_and_arg;
614 case Ropenset:
615 {
616 int complement,prev,offset,range,firstchar;
617
618 SET_LEVEL_START;
619 ALLOC(1+256/8);
620 STORE(Cset);
621 offset = pattern_offset;
622 for (a = 0; a < 256/8; a++)
623 STORE(0);
624 NEXTCHAR(ch);
625 if (translate)
626 ch = translate[(unsigned char)ch];
627 if (ch == '\136')
628 {
629 complement = 1;
630 NEXTCHAR(ch);
631 if (translate)
632 ch = translate[(unsigned char)ch];
633 }
634 else
635 complement = 0;
636 prev = -1;
637 range = 0;
638 firstchar = 1;
639 while (ch != '\135' || firstchar)
640 {
641 firstchar = 0;
642 if (regexp_ansi_sequences && ch == '\134')
643 {
644 NEXTCHAR(ch);
645 ANSI_TRANSLATE(ch);
646 }
647 if (range)
648 {
Guido van Rossumb6775db1994-08-01 11:34:53 +0000649 for (a = prev; a <= (int)ch; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000650 SETBIT(pattern, offset, a);
651 prev = -1;
652 range = 0;
653 }
654 else
655 if (prev != -1 && ch == '-')
656 range = 1;
657 else
658 {
659 SETBIT(pattern, offset, ch);
660 prev = ch;
661 }
662 NEXTCHAR(ch);
663 if (translate)
664 ch = translate[(unsigned char)ch];
665 }
666 if (range)
667 SETBIT(pattern, offset, '-');
668 if (complement)
669 {
670 for (a = 0; a < 256/8; a++)
671 pattern[offset+a] ^= 0xff;
672 }
673 break;
674 }
675 case Rbegbuf:
676 opcode = Cbegbuf;
677 goto store_opcode;
678 case Rendbuf:
679 opcode = Cendbuf;
680 goto store_opcode;
681 case Rwordchar:
682 opcode = Csyntaxspec;
683 ch = Sword;
684 goto store_opcode_and_arg;
685 case Rnotwordchar:
686 opcode = Cnotsyntaxspec;
687 ch = Sword;
688 goto store_opcode_and_arg;
689 case Rwordbeg:
690 opcode = Cwordbeg;
691 goto store_opcode;
692 case Rwordend:
693 opcode = Cwordend;
694 goto store_opcode;
695 case Rwordbound:
696 opcode = Cwordbound;
697 goto store_opcode;
698 case Rnotwordbound:
699 opcode = Cnotwordbound;
700 goto store_opcode;
701#ifdef emacs
702 case Remacs_at_dot:
703 opcode = Cemacs_at_dot;
704 goto store_opcode;
705 case Remacs_syntaxspec:
706 NEXTCHAR(ch);
707 if (translate)
708 ch = translate[(unsigned char)ch];
709 opcode = Csyntaxspec;
710 ch = syntax_spec_code[(unsigned char)ch];
711 goto store_opcode_and_arg;
712 case Remacs_notsyntaxspec:
713 NEXTCHAR(ch);
714 if (translate)
715 ch = translate[(unsigned char)ch];
716 opcode = Cnotsyntaxspec;
717 ch = syntax_spec_code[(unsigned char)ch];
718 goto store_opcode_and_arg;
719#endif /* emacs */
720 default:
721 abort();
722 }
723 beginning_context = (op == Ropenpar || op == Ror);
724 }
725 if (starts_base != 0)
726 goto parenthesis_error;
727 assert(num_jumps == 0);
728 ALLOC(1);
729 STORE(Cend);
730 SET_FIELDS;
731 return NULL;
732
733 op_error:
734 SET_FIELDS;
735 return "Badly placed special character";
736
737 bad_match_register:
738 SET_FIELDS;
739 return "Bad match register number";
740
741 hex_error:
742 SET_FIELDS;
743 return "Bad hexadecimal number";
744
745 parenthesis_error:
746 SET_FIELDS;
747 return "Badly placed parenthesis";
748
749 out_of_memory:
750 SET_FIELDS;
751 return "Out of memory";
752
753 ends_prematurely:
754 SET_FIELDS;
755 return "Regular expression ends prematurely";
756
757 too_complex:
758 SET_FIELDS;
759 return "Regular expression too complex";
760}
761#undef CHARAT
762#undef NEXTCHAR
763#undef GETHEX
764#undef ALLOC
765#undef STORE
766#undef CURRENT_LEVEL_START
767#undef SET_LEVEL_START
768#undef PUSH_LEVEL_STARTS
769#undef POP_LEVEL_STARTS
770#undef PUT_ADDR
771#undef INSERT_JUMP
772#undef SETBIT
773#undef SET_FIELDS
774
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000775static void re_compile_fastmap_aux
776 PROTO((char *, int, char *, char *, char *));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000777static void re_compile_fastmap_aux(code, pos, visited, can_be_null, fastmap)
778char *code, *visited, *can_be_null, *fastmap;
779int pos;
780{
781 int a, b, syntaxcode;
782
783 if (visited[pos])
784 return; /* we have already been here */
785 visited[pos] = 1;
786 for (;;)
787 switch (code[pos++])
788 {
789 case Cend:
790 *can_be_null = 1;
791 return;
792 case Cbol:
793 case Cbegbuf:
794 case Cendbuf:
795 case Cwordbeg:
796 case Cwordend:
797 case Cwordbound:
798 case Cnotwordbound:
799#ifdef emacs
800 case Cemacs_at_dot:
801#endif /* emacs */
802 break;
803 case Csyntaxspec:
804 syntaxcode = code[pos++];
805 for (a = 0; a < 256; a++)
806 if (SYNTAX(a) == syntaxcode)
807 fastmap[a] = 1;
808 return;
809 case Cnotsyntaxspec:
810 syntaxcode = code[pos++];
811 for (a = 0; a < 256; a++)
812 if (SYNTAX(a) != syntaxcode)
813 fastmap[a] = 1;
814 return;
815 case Ceol:
816 fastmap['\n'] = 1;
817 if (*can_be_null == 0)
818 *can_be_null = 2; /* can match null, but only at end of buffer*/
819 return;
820 case Cset:
821 for (a = 0; a < 256/8; a++)
822 if (code[pos + a] != 0)
823 for (b = 0; b < 8; b++)
824 if (code[pos + a] & (1 << b))
825 fastmap[(a << 3) + b] = 1;
826 pos += 256/8;
827 return;
828 case Cexact:
829 fastmap[(unsigned char)code[pos]] = 1;
830 return;
831 case Canychar:
832 for (a = 0; a < 256; a++)
833 if (a != '\n')
834 fastmap[a] = 1;
835 return;
836 case Cstart_memory:
837 case Cend_memory:
838 pos++;
839 break;
840 case Cmatch_memory:
841 /* should this ever happen for sensible patterns??? */
842 *can_be_null = 1;
843 return;
844 case Cjump:
845 case Cdummy_failure_jump:
846 case Cupdate_failure_jump:
847 case Cstar_jump:
848 a = (unsigned char)code[pos++];
849 a |= (unsigned char)code[pos++] << 8;
850 pos += (int)(short)a;
851 if (visited[pos])
852 {
853 /* argh... the regexp contains empty loops. This is not
854 good, as this may cause a failure stack overflow when
855 matching. Oh well. */
856 /* this path leads nowhere; pursue other paths. */
857 return;
858 }
859 visited[pos] = 1;
860 break;
861 case Cfailure_jump:
862 a = (unsigned char)code[pos++];
863 a |= (unsigned char)code[pos++] << 8;
864 a = pos + (int)(short)a;
865 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
866 break;
867 default:
868 abort(); /* probably some opcode is missing from this switch */
869 /*NOTREACHED*/
870 }
871}
872
Guido van Rossum3b1a57a1992-01-27 16:47:46 +0000873static int re_do_compile_fastmap PROTO((char *, int, int, char *, char *));
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000874static int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
875char *buffer, *fastmap, *can_be_null;
876int used, pos;
877{
878 char small_visited[512], *visited;
879
880 if (used <= sizeof(small_visited))
881 visited = small_visited;
882 else
883 {
884 visited = malloc(used);
885 if (!visited)
886 return 0;
887 }
888 *can_be_null = 0;
889 memset(fastmap, 0, 256);
890 memset(visited, 0, used);
891 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
892 if (visited != small_visited)
893 free(visited);
894 return 1;
895}
896
897void re_compile_fastmap(bufp)
898regexp_t bufp;
899{
900 if (!bufp->fastmap || bufp->fastmap_accurate)
901 return;
902 assert(bufp->used > 0);
903 if (!re_do_compile_fastmap(bufp->buffer, bufp->used, 0, &bufp->can_be_null,
904 bufp->fastmap))
905 return;
906 if (bufp->buffer[0] == Cbol)
907 bufp->anchor = 1; /* begline */
908 else
909 if (bufp->buffer[0] == Cbegbuf)
910 bufp->anchor = 2; /* begbuf */
911 else
912 bufp->anchor = 0; /* none */
913 bufp->fastmap_accurate = 1;
914}
915
916#define INITIAL_FAILURES 128 /* initial # failure points to allocate */
Guido van Rossum88661e81996-05-23 22:55:58 +0000917#define MAX_FAILURES 4100L /* max # of failure points before failing */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000918
919int re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop)
920regexp_t bufp;
921char *string1, *string2;
922int size1, size2, pos, mstop;
923regexp_registers_t regs;
924{
925 struct failure_point { char *text, *partend, *code; }
926 *failure_stack_start, *failure_sp, *failure_stack_end,
927 initial_failure_stack[INITIAL_FAILURES];
928 char *code, *translate, *text, *textend, *partend, *part_2_end;
929 char *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
930 char *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
931 int a, b, ch, reg, regch, match_end;
932 char *regtext, *regpartend, *regtextend;
933
934#define PREFETCH \
935 MACRO_BEGIN \
936 if (text == partend) \
937 { \
938 if (text == textend) \
939 goto fail; \
940 text = string2; \
941 partend = part_2_end; \
942 } \
943 MACRO_END
944
945#define NEXTCHAR(var) \
946 MACRO_BEGIN \
947 PREFETCH; \
948 (var) = (unsigned char)*text++; \
949 if (translate) \
950 (var) = (unsigned char)translate[(var)]; \
951 MACRO_END
952
953 assert(pos >= 0 && size1 >= 0 && size2 >= 0 && mstop >= 0);
954 assert(mstop <= size1 + size2);
955 assert(pos <= mstop);
956
957 if (pos <= size1)
958 {
959 text = string1 + pos;
960 if (mstop <= size1)
961 {
962 partend = string1 + mstop;
963 textend = partend;
964 }
965 else
966 {
967 partend = string1 + size1;
968 textend = string2 + mstop - size1;
969 }
970 part_2_end = string2 + mstop - size1;
971 }
972 else
973 {
974 text = string2 + pos - size1;
975 partend = string2 + mstop - size1;
976 textend = partend;
977 part_2_end = partend;
978 }
979
980 if (bufp->uses_registers && regs != NULL)
981 for (a = 0; a < RE_NREGS; a++)
982 regend_text[a] = NULL;
983
984 code = bufp->buffer;
985 translate = bufp->translate;
986 failure_stack_start = failure_sp = initial_failure_stack;
987 failure_stack_end = initial_failure_stack + INITIAL_FAILURES;
988
989#if 0
990 /* re_search_2 has already done this, and otherwise we get little benefit
991 from this. So I'll leave this out. */
992 if (bufp->fastmap_accurate && !bufp->can_be_null &&
993 text != textend &&
994 !bufp->fastmap[translate ?
995 (unsigned char)translate[(unsigned char)*text] :
996 (unsigned char)*text])
997 return -1; /* it can't possibly match */
998#endif
999
1000 continue_matching:
1001 for (;;)
1002 {
1003 switch (*code++)
1004 {
1005 case Cend:
1006 if (partend != part_2_end)
1007 match_end = text - string1;
1008 else
1009 match_end = text - string2 + size1;
1010 if (regs)
1011 {
1012 regs->start[0] = pos;
1013 regs->end[0] = match_end;
1014 if (!bufp->uses_registers)
1015 {
1016 for (a = 1; a < RE_NREGS; a++)
1017 {
1018 regs->start[a] = -1;
1019 regs->end[a] = -1;
1020 }
1021 }
1022 else
1023 {
1024 for (a = 1; a < RE_NREGS; a++)
1025 {
1026 if (regend_text[a] == NULL)
1027 {
1028 regs->start[a] = -1;
1029 regs->end[a] = -1;
1030 continue;
1031 }
1032 if (regstart_partend[a] != part_2_end)
1033 regs->start[a] = regstart_text[a] - string1;
1034 else
1035 regs->start[a] = regstart_text[a] - string2 + size1;
1036 if (regend_partend[a] != part_2_end)
1037 regs->end[a] = regend_text[a] - string1;
1038 else
1039 regs->end[a] = regend_text[a] - string2 + size1;
1040 }
1041 }
1042 }
1043 if (failure_stack_start != initial_failure_stack)
1044 free((char *)failure_stack_start);
1045 return match_end - pos;
1046 case Cbol:
1047 if (text == string1 || text[-1] == '\n') /* text[-1] always valid */
1048 break;
1049 goto fail;
1050 case Ceol:
1051 if (text == string2 + size2 ||
1052 (text == string1 + size1 ?
1053 (size2 == 0 || *string2 == '\n') :
1054 *text == '\n'))
1055 break;
1056 goto fail;
1057 case Cset:
1058 NEXTCHAR(ch);
1059 if (code[ch/8] & (1<<(ch & 7)))
1060 {
1061 code += 256/8;
1062 break;
1063 }
1064 goto fail;
1065 case Cexact:
1066 NEXTCHAR(ch);
1067 if (ch != (unsigned char)*code++)
1068 goto fail;
1069 break;
1070 case Canychar:
1071 NEXTCHAR(ch);
1072 if (ch == '\n')
1073 goto fail;
1074 break;
1075 case Cstart_memory:
1076 reg = *code++;
1077 regstart_text[reg] = text;
1078 regstart_partend[reg] = partend;
1079 break;
1080 case Cend_memory:
1081 reg = *code++;
1082 regend_text[reg] = text;
1083 regend_partend[reg] = partend;
1084 break;
1085 case Cmatch_memory:
1086 reg = *code++;
1087 if (regend_text[reg] == NULL)
1088 goto fail; /* or should we just match nothing? */
1089 regtext = regstart_text[reg];
1090 regtextend = regend_text[reg];
1091 if (regstart_partend[reg] == regend_partend[reg])
1092 regpartend = regtextend;
1093 else
1094 regpartend = string1 + size1;
1095
1096 for (;regtext != regtextend;)
1097 {
1098 NEXTCHAR(ch);
1099 if (regtext == regpartend)
1100 regtext = string2;
1101 regch = (unsigned char)*regtext++;
1102 if (translate)
1103 regch = (unsigned char)translate[regch];
1104 if (regch != ch)
1105 goto fail;
1106 }
1107 break;
1108 case Cstar_jump:
1109 /* star is coded as:
1110 1: failure_jump 2
1111 ... code for operand of star
1112 star_jump 1
1113 2: ... code after star
1114 We change the star_jump to update_failure_jump if we can determine
1115 that it is safe to do so; otherwise we change it to an ordinary
1116 jump.
1117 plus is coded as
1118 jump 2
1119 1: failure_jump 3
1120 2: ... code for operand of plus
1121 star_jump 1
1122 3: ... code after plus
1123 For star_jump considerations this is processed identically
1124 to star. */
1125 a = (unsigned char)*code++;
1126 a |= (unsigned char)*code++ << 8;
1127 a = (int)(short)a;
1128 {
1129 char map[256], can_be_null;
1130 char *p1, *p2;
1131
1132 p1 = code + a + 3; /* skip the failure_jump */
1133 assert(p1[-3] == Cfailure_jump);
1134 p2 = code;
1135 /* p1 points inside loop, p2 points to after loop */
1136 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
1137 p2 - bufp->buffer, &can_be_null, map))
1138 goto make_normal_jump;
1139 /* If we might introduce a new update point inside the loop,
1140 we can't optimize because then update_jump would update a
1141 wrong failure point. Thus we have to be quite careful here. */
1142 loop_p1:
1143 /* loop until we find something that consumes a character */
1144 switch (*p1++)
1145 {
1146 case Cbol:
1147 case Ceol:
1148 case Cbegbuf:
1149 case Cendbuf:
1150 case Cwordbeg:
1151 case Cwordend:
1152 case Cwordbound:
1153 case Cnotwordbound:
1154#ifdef emacs
1155 case Cemacs_at_dot:
1156#endif /* emacs */
1157 goto loop_p1;
1158 case Cstart_memory:
1159 case Cend_memory:
1160 p1++;
1161 goto loop_p1;
1162 case Cexact:
1163 ch = (unsigned char)*p1++;
1164 if (map[ch])
1165 goto make_normal_jump;
1166 break;
1167 case Canychar:
1168 for (b = 0; b < 256; b++)
1169 if (b != '\n' && map[b])
1170 goto make_normal_jump;
1171 break;
1172 case Cset:
1173 for (b = 0; b < 256; b++)
1174 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
1175 goto make_normal_jump;
1176 p1 += 256/8;
1177 break;
1178 default:
1179 goto make_normal_jump;
1180 }
1181 /* now we know that we can't backtrack. */
1182 while (p1 != p2 - 3)
1183 {
1184 switch (*p1++)
1185 {
1186 case Cend:
1187 abort(); /* we certainly shouldn't get this inside loop */
1188 /*NOTREACHED*/
1189 case Cbol:
1190 case Ceol:
1191 case Canychar:
1192 case Cbegbuf:
1193 case Cendbuf:
1194 case Cwordbeg:
1195 case Cwordend:
1196 case Cwordbound:
1197 case Cnotwordbound:
1198#ifdef emacs
1199 case Cemacs_at_dot:
1200#endif /* emacs */
1201 break;
1202 case Cset:
1203 p1 += 256/8;
1204 break;
1205 case Cexact:
1206 case Cstart_memory:
1207 case Cend_memory:
1208 case Cmatch_memory:
1209 case Csyntaxspec:
1210 case Cnotsyntaxspec:
1211 p1++;
1212 break;
1213 case Cjump:
1214 case Cstar_jump:
1215 case Cfailure_jump:
1216 case Cupdate_failure_jump:
1217 case Cdummy_failure_jump:
1218 goto make_normal_jump;
1219 default:
1220 printf("regexpr.c: processing star_jump: unknown op %d\n", p1[-1]);
1221 break;
1222 }
1223 }
1224 goto make_update_jump;
1225 }
1226 make_normal_jump:
1227 /* printf("changing to normal jump\n"); */
1228 code -= 3;
1229 *code = Cjump;
1230 break;
1231 make_update_jump:
1232 /* printf("changing to update jump\n"); */
1233 code -= 2;
1234 a += 3; /* jump to after the Cfailure_jump */
1235 code[-1] = Cupdate_failure_jump;
1236 code[0] = a & 0xff;
1237 code[1] = a >> 8;
1238 /* fall to next case */
1239 case Cupdate_failure_jump:
1240 failure_sp[-1].text = text;
1241 failure_sp[-1].partend = partend;
1242 /* fall to next case */
1243 case Cjump:
1244 a = (unsigned char)*code++;
1245 a |= (unsigned char)*code++ << 8;
1246 code += (int)(short)a;
1247 break;
1248 case Cdummy_failure_jump:
1249 case Cfailure_jump:
1250 if (failure_sp == failure_stack_end)
1251 {
1252 if (failure_stack_start != initial_failure_stack)
1253 goto error;
1254 failure_stack_start = (struct failure_point *)
1255 malloc(MAX_FAILURES * sizeof(*failure_stack_start));
Guido van Rossumc65a5251994-08-05 13:44:50 +00001256 if (failure_stack_start == NULL)
1257 {
1258 failure_stack_start = initial_failure_stack;
1259 goto error;
1260 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001261 failure_stack_end = failure_stack_start + MAX_FAILURES;
1262 memcpy((char *)failure_stack_start, (char *)initial_failure_stack,
1263 INITIAL_FAILURES * sizeof(*failure_stack_start));
1264 failure_sp = failure_stack_start + INITIAL_FAILURES;
1265 }
1266 a = (unsigned char)*code++;
1267 a |= (unsigned char)*code++ << 8;
1268 a = (int)(short)a;
1269 if (code[-3] == Cdummy_failure_jump)
1270 { /* this is only used in plus */
1271 assert(*code == Cfailure_jump);
1272 b = (unsigned char)code[1];
1273 b |= (unsigned char)code[2] << 8;
1274 failure_sp->code = code + (int)(short)b + 3;
1275 failure_sp->text = NULL;
1276 code += a;
1277 }
1278 else
1279 {
1280 failure_sp->code = code + a;
1281 failure_sp->text = text;
1282 failure_sp->partend = partend;
1283 }
1284 failure_sp++;
1285 break;
1286 case Cbegbuf:
1287 if (text == string1)
1288 break;
1289 goto fail;
1290 case Cendbuf:
1291 if (size2 == 0 ? text == string1 + size1 : text == string2 + size2)
1292 break;
1293 goto fail;
1294 case Cwordbeg:
1295 if (text == string2 + size2)
1296 goto fail;
1297 if (size2 == 0 && text == string1 + size1)
1298 goto fail;
1299 if (SYNTAX(text == string1 + size1 ? *string1 : *text) != Sword)
1300 goto fail;
1301 if (text == string1)
1302 break;
1303 if (SYNTAX(text[-1]) != Sword)
1304 break;
1305 goto fail;
1306 case Cwordend:
1307 if (text == string1)
1308 goto fail;
1309 if (SYNTAX(text[-1]) != Sword)
1310 goto fail;
1311 if (text == string2 + size2)
1312 break;
1313 if (size2 == 0 && text == string1 + size1)
1314 break;
1315 if (SYNTAX(*text) == Sword)
1316 goto fail;
1317 break;
1318 case Cwordbound:
1319 /* Note: as in gnu regexp, this also matches at the beginning
1320 and end of buffer. */
1321 if (text == string1 || text == string2 + size2 ||
1322 (size2 == 0 && text == string1 + size1))
1323 break;
1324 if ((SYNTAX(text[-1]) == Sword) ^
1325 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword))
1326 break;
1327 goto fail;
1328 case Cnotwordbound:
1329 /* Note: as in gnu regexp, this never matches at the beginning
1330 and end of buffer. */
1331 if (text == string1 || text == string2 + size2 ||
1332 (size2 == 0 && text == string1 + size1))
1333 goto fail;
1334 if (!((SYNTAX(text[-1]) == Sword) ^
1335 (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)))
1336 goto fail;
1337 break;
1338 case Csyntaxspec:
1339 NEXTCHAR(ch);
1340 if (SYNTAX(ch) != (unsigned char)*code++)
1341 goto fail;
1342 break;
1343 case Cnotsyntaxspec:
1344 NEXTCHAR(ch);
1345 if (SYNTAX(ch) != (unsigned char)*code++)
1346 break;
1347 goto fail;
1348#ifdef emacs
1349 case Cemacs_at_dot:
1350 if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
1351 goto fail;
1352 break;
1353#endif /* emacs */
1354 default:
1355 abort();
1356 /*NOTREACHED*/
1357 }
1358 }
Guido van Rossum3b1a57a1992-01-27 16:47:46 +00001359#if 0 /* This line is never reached --Guido */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001360 abort();
Guido van Rossum5f21dd11992-01-19 16:49:14 +00001361#endif
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001362 /*NOTREACHED*/
1363
1364 fail:
1365 if (failure_sp != failure_stack_start)
1366 {
1367 failure_sp--;
1368 text = failure_sp->text;
1369 if (text == NULL)
1370 goto fail;
1371 partend = failure_sp->partend;
1372 code = failure_sp->code;
1373 goto continue_matching;
1374 }
1375 if (failure_stack_start != initial_failure_stack)
1376 free((char *)failure_stack_start);
1377 return -1;
1378
1379 error:
1380 if (failure_stack_start != initial_failure_stack)
1381 free((char *)failure_stack_start);
1382 return -2;
1383}
1384
1385#undef PREFETCH
1386#undef NEXTCHAR
1387#undef PUSH_FAILURE
1388
1389int re_match(bufp, string, size, pos, regs)
1390regexp_t bufp;
1391char *string;
1392int size, pos;
1393regexp_registers_t regs;
1394{
1395 return re_match_2(bufp, string, size, (char *)NULL, 0, pos, regs, size);
1396}
1397
1398int re_search_2(bufp, string1, size1, string2, size2, pos, range, regs,
1399 mstop)
1400regexp_t bufp;
1401char *string1, *string2;
1402int size1, size2, pos, range, mstop;
1403regexp_registers_t regs;
1404{
1405 char *fastmap, *translate, *text, *partstart, *partend;
1406 int dir, ret;
1407 char anchor;
1408
1409 assert(size1 >= 0 && size2 >= 0 && pos >= 0 && mstop >= 0);
Guido van Rossum521f81c1992-01-26 18:23:48 +00001410 assert(pos + range >= 0 && pos + range <= size1 + size2); /* Bugfix by ylo */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001411 assert(pos <= mstop);
1412
1413 fastmap = bufp->fastmap;
1414 translate = bufp->translate;
1415 if (fastmap && !bufp->fastmap_accurate)
1416 re_compile_fastmap(bufp);
1417 anchor = bufp->anchor;
1418 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1419 fastmap = NULL;
1420 if (range < 0)
1421 {
1422 dir = -1;
1423 range = -range;
1424 }
1425 else
1426 dir = 1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001427 if (anchor == 2)
1428 if (pos != 0)
1429 return -1;
1430 else
1431 range = 0;
1432 for (; range >= 0; range--, pos += dir)
1433 {
1434 if (fastmap)
1435 {
1436 if (dir == 1)
1437 { /* searching forwards */
1438 if (pos < size1)
1439 {
1440 text = string1 + pos;
1441 if (pos + range > size1)
1442 partend = string1 + size1;
1443 else
1444 partend = string1 + pos + range;
1445 }
1446 else
1447 {
1448 text = string2 + pos - size1;
1449 partend = string2 + pos + range - size1;
1450 }
1451 partstart = text;
1452 if (translate)
1453 while (text != partend &&
1454 !fastmap[(unsigned char)
1455 translate[(unsigned char)*text]])
1456 text++;
1457 else
1458 while (text != partend && !fastmap[(unsigned char)*text])
1459 text++;
1460 pos += text - partstart;
1461 range -= text - partstart;
1462 if (pos == size1 + size2 && bufp->can_be_null == 0)
1463 return -1;
1464 }
1465 else
1466 { /* searching backwards */
1467 if (pos <= size1)
1468 {
1469 text = string1 + pos;
1470 partstart = string1 + pos - range;
1471 }
1472 else
1473 {
1474 text = string2 + pos - size1;
1475 if (range < pos - size1)
1476 partstart = string2 + pos - size1 - range;
1477 else
1478 partstart = string2;
1479 }
1480 partend = text;
1481 if (translate)
1482 while (text != partstart &&
1483 !fastmap[(unsigned char)
1484 translate[(unsigned char)*text]])
1485 text--;
1486 else
1487 while (text != partstart &&
1488 !fastmap[(unsigned char)*text])
1489 text--;
1490 pos -= partend - text;
1491 range -= partend - text;
1492 }
1493 }
1494 if (anchor == 1)
1495 { /* anchored to begline */
1496 if (pos > 0 &&
1497 (pos <= size1 ? string1[pos - 1] :
1498 string2[pos - size1 - 1]) != '\n')
1499 continue;
1500 }
1501 assert(pos >= 0 && pos <= size1 + size2);
1502 ret = re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop);
1503 if (ret >= 0)
1504 return pos;
1505 if (ret == -2)
1506 return -2;
1507 }
1508 return -1;
1509}
1510
1511int re_search(bufp, string, size, startpos, range, regs)
1512regexp_t bufp;
1513char *string;
1514int size, startpos, range;
1515regexp_registers_t regs;
1516{
1517 return re_search_2(bufp, string, size, (char *)NULL, 0,
1518 startpos, range, regs, size);
1519}
1520
Guido van Rossum9abc5391992-03-27 17:24:37 +00001521#ifdef UNUSED
1522
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001523static struct re_pattern_buffer re_comp_buf;
1524
1525char *re_comp(s)
1526char *s;
1527{
1528 if (s == NULL)
1529 {
1530 if (!re_comp_buf.buffer)
1531 return "Out of memory";
1532 return NULL;
1533 }
1534 if (!re_comp_buf.buffer)
1535 {
1536 /* the buffer will be allocated automatically */
1537 re_comp_buf.fastmap = malloc(256);
1538 re_comp_buf.translate = NULL;
Guido van Rossumc65a5251994-08-05 13:44:50 +00001539 if (re_comp_buf.fastmap == NULL)
1540 return "Out of memory";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001541 }
1542 return re_compile_pattern(s, strlen(s), &re_comp_buf);
1543}
1544
1545int re_exec(s)
1546char *s;
1547{
1548 int len = strlen(s);
1549
1550 return re_search(&re_comp_buf, s, len, 0, len, (regexp_registers_t)NULL) >= 0;
1551}
1552
Guido van Rossum9abc5391992-03-27 17:24:37 +00001553#endif
1554
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001555#ifdef TEST_REGEXP
1556
1557int main()
1558{
1559 char buf[500];
1560 char *cp;
1561 struct re_pattern_buffer exp;
1562 struct re_registers regs;
1563 int a,pos;
1564 char fastmap[256];
1565
1566 exp.allocated = 0;
1567 exp.buffer = 0;
1568 exp.translate = NULL;
1569 exp.fastmap = fastmap;
1570
1571 /* re_set_syntax(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_ANSI_HEX); */
1572
1573 while (1)
1574 {
1575 printf("Enter regexp:\n");
1576 gets(buf);
1577 cp=re_compile_pattern(buf, strlen(buf), &exp);
1578 if (cp)
1579 {
1580 printf("Error: %s\n", cp);
1581 continue;
1582 }
1583 re_compile_fastmap(&exp);
1584 printf("dump:\n");
1585 for (pos = 0; pos < exp.used;)
1586 {
1587 printf("%d: ", pos);
1588 switch (exp.buffer[pos++])
1589 {
1590 case Cend:
1591 strcpy(buf, "end");
1592 break;
1593 case Cbol:
1594 strcpy(buf, "bol");
1595 break;
1596 case Ceol:
1597 strcpy(buf, "eol");
1598 break;
1599 case Cset:
1600 strcpy(buf, "set ");
1601 for (a = 0; a < 256/8; a++)
1602 sprintf(buf+strlen(buf)," %02x",
1603 (unsigned char)exp.buffer[pos++]);
1604 break;
1605 case Cexact:
1606 sprintf(buf, "exact '%c' 0x%x", exp.buffer[pos],
1607 (unsigned char)exp.buffer[pos]);
1608 pos++;
1609 break;
1610 case Canychar:
1611 strcpy(buf, "anychar");
1612 break;
1613 case Cstart_memory:
1614 sprintf(buf, "start_memory %d", exp.buffer[pos++]);
1615 break;
1616 case Cend_memory:
1617 sprintf(buf, "end_memory %d", exp.buffer[pos++]);
1618 break;
1619 case Cmatch_memory:
1620 sprintf(buf, "match_memory %d", exp.buffer[pos++]);
1621 break;
1622 case Cjump:
1623 case Cdummy_failure_jump:
1624 case Cstar_jump:
1625 case Cfailure_jump:
1626 case Cupdate_failure_jump:
1627 a = (unsigned char)exp.buffer[pos++];
1628 a += (unsigned char)exp.buffer[pos++] << 8;
1629 a = (int)(short)a;
1630 switch (exp.buffer[pos-3])
1631 {
1632 case Cjump:
1633 cp = "jump";
1634 break;
1635 case Cstar_jump:
1636 cp = "star_jump";
1637 break;
1638 case Cfailure_jump:
1639 cp = "failure_jump";
1640 break;
1641 case Cupdate_failure_jump:
1642 cp = "update_failure_jump";
1643 break;
1644 case Cdummy_failure_jump:
1645 cp = "dummy_failure_jump";
1646 break;
1647 default:
1648 cp = "unknown jump";
1649 break;
1650 }
1651 sprintf(buf, "%s %d", cp, a + pos);
1652 break;
1653 case Cbegbuf:
1654 strcpy(buf,"begbuf");
1655 break;
1656 case Cendbuf:
1657 strcpy(buf,"endbuf");
1658 break;
1659 case Cwordbeg:
1660 strcpy(buf,"wordbeg");
1661 break;
1662 case Cwordend:
1663 strcpy(buf,"wordend");
1664 break;
1665 case Cwordbound:
1666 strcpy(buf,"wordbound");
1667 break;
1668 case Cnotwordbound:
1669 strcpy(buf,"notwordbound");
1670 break;
1671 default:
1672 sprintf(buf, "unknown code %d",
1673 (unsigned char)exp.buffer[pos - 1]);
1674 break;
1675 }
1676 printf("%s\n", buf);
1677 }
1678 printf("can_be_null = %d uses_registers = %d anchor = %d\n",
1679 exp.can_be_null, exp.uses_registers, exp.anchor);
1680
1681 printf("fastmap:");
1682 for (a = 0; a < 256; a++)
1683 if (exp.fastmap[a])
1684 printf(" %d", a);
1685 printf("\n");
1686 printf("Enter strings. An empty line terminates.\n");
1687 while (fgets(buf, sizeof(buf), stdin))
1688 {
1689 if (buf[0] == '\n')
1690 break;
1691 a = re_search(&exp, buf, strlen(buf), 0, strlen(buf), &regs);
1692 printf("search returns %d\n", a);
1693 if (a != -1)
1694 {
1695 for (a = 0; a < RE_NREGS; a++)
1696 {
1697 printf("buf %d: %d to %d\n", a, regs.start[a], regs.end[a]);
1698 }
1699 }
1700 }
1701 }
1702}
1703
1704#endif /* TEST_REGEXP */