blob: 545fb7d1689bf0c11d34d80f863953a6e8a587ed [file] [log] [blame]
Brian Kernighan87b94932012-12-22 10:35:39 -05001/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25/* lasciate ogne speranza, voi ch'intrate. */
26
27#define DEBUG
28
29#include <ctype.h>
Leonardo Taccari05014f52018-08-29 18:06:33 +020030#include <limits.h>
Brian Kernighan87b94932012-12-22 10:35:39 -050031#include <stdio.h>
32#include <string.h>
33#include <stdlib.h>
34#include "awk.h"
35#include "ytab.h"
36
Brian Kernighan87b94932012-12-22 10:35:39 -050037#define MAXLIN 22
38
39#define type(v) (v)->nobj /* badly overloaded here */
40#define info(v) (v)->ntype /* badly overloaded here */
41#define left(v) (v)->narg[0]
42#define right(v) (v)->narg[1]
43#define parent(v) (v)->nnext
44
45#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46#define ELEAF case EMPTYRE: /* empty string in regexp */
47#define UNARY case STAR: case PLUS: case QUEST:
48
49/* encoding in tree Nodes:
50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 left is index, right contains value or pointer to value
52 unary (STAR, PLUS, QUEST): left is child, right is null
53 binary (CAT, OR): left and right are children
54 parent contains pointer to parent
55*/
56
57
58int *setvec;
59int *tmpset;
60int maxsetvec = 0;
61
62int rtok; /* next token in current re */
63int rlxval;
zoulasc65892082019-10-24 09:40:15 -040064static const uschar *rlxstr;
65static const uschar *prestr; /* current position in current re */
66static const uschar *lastre; /* origin of last re */
67static const uschar *lastatom; /* origin of last Atom */
68static const uschar *starttok;
69static const uschar *basestr; /* starts with original, replaced during
Martijn Dekker8a222222019-01-23 09:12:27 +000070 repetition processing */
zoulasc65892082019-10-24 09:40:15 -040071static const uschar *firstbasestr;
Brian Kernighan87b94932012-12-22 10:35:39 -050072
73static int setcnt;
74static int poscnt;
75
zoulasc65892082019-10-24 09:40:15 -040076const char *patbeg;
Brian Kernighan87b94932012-12-22 10:35:39 -050077int patlen;
78
zoulascc16e8692019-10-17 13:04:46 -040079#define NFA 128 /* cache this many dynamic fa's */
Brian Kernighan87b94932012-12-22 10:35:39 -050080fa *fatab[NFA];
81int nfatab = 0; /* entries in fatab */
82
zoulascc16e8692019-10-17 13:04:46 -040083static int *
84intalloc(size_t n, const char *f)
85{
86 void *p = calloc(n, sizeof(int));
87 if (p == NULL)
88 overflo(f);
89 return p;
90}
91
92static void
93resizesetvec(const char *f)
94{
95 if (maxsetvec == 0)
96 maxsetvec = MAXLIN;
97 else
98 maxsetvec *= 4;
99 setvec = realloc(setvec, maxsetvec * sizeof(*setvec));
100 tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset));
101 if (setvec == NULL || tmpset == NULL)
102 overflo(f);
103}
104
105static void
106resize_state(fa *f, int state)
107{
108 void *p;
109 int i, new_count;
110
111 if (++state < f->state_count)
112 return;
113
114 new_count = state + 10; /* needs to be tuned */
115
116 p = realloc(f->gototab, new_count * sizeof(f->gototab[0]));
117 if (p == NULL)
118 goto out;
119 f->gototab = p;
120
121 p = realloc(f->out, new_count * sizeof(f->out[0]));
122 if (p == NULL)
123 goto out;
124 f->out = p;
125
126 p = realloc(f->posns, new_count * sizeof(f->posns[0]));
127 if (p == NULL)
128 goto out;
129 f->posns = p;
130
131 for (i = f->state_count; i < new_count; ++i) {
132 f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab));
133 if (f->gototab[i] == NULL)
134 goto out;
135 f->out[i] = 0;
136 f->posns[i] = NULL;
137 }
138 f->state_count = new_count;
139 return;
140out:
141 overflo(__func__);
142}
143
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200144fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
Brian Kernighan87b94932012-12-22 10:35:39 -0500145{
146 int i, use, nuse;
147 fa *pfa;
148 static int now = 1;
149
pfg52421942016-06-03 21:23:11 +0000150 if (setvec == NULL) { /* first time through any RE */
zoulascc16e8692019-10-17 13:04:46 -0400151 resizesetvec(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500152 }
153
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200154 if (compile_time != RUNNING) /* a constant for sure */
Brian Kernighan87b94932012-12-22 10:35:39 -0500155 return mkdfa(s, anchor);
156 for (i = 0; i < nfatab; i++) /* is it there already? */
157 if (fatab[i]->anchor == anchor
158 && strcmp((const char *) fatab[i]->restr, s) == 0) {
159 fatab[i]->use = now++;
160 return fatab[i];
161 }
162 pfa = mkdfa(s, anchor);
163 if (nfatab < NFA) { /* room for another */
164 fatab[nfatab] = pfa;
165 fatab[nfatab]->use = now++;
166 nfatab++;
167 return pfa;
168 }
169 use = fatab[0]->use; /* replace least-recently used */
170 nuse = 0;
171 for (i = 1; i < nfatab; i++)
172 if (fatab[i]->use < use) {
173 use = fatab[i]->use;
174 nuse = i;
175 }
176 freefa(fatab[nuse]);
177 fatab[nuse] = pfa;
178 pfa->use = now++;
179 return pfa;
180}
181
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200182fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200183 /* anchor = true for anchored matches, else false */
Brian Kernighan87b94932012-12-22 10:35:39 -0500184{
185 Node *p, *p1;
186 fa *f;
187
zoulasc65892082019-10-24 09:40:15 -0400188 firstbasestr = (const uschar *) s;
Martijn Dekker8a222222019-01-23 09:12:27 +0000189 basestr = firstbasestr;
Brian Kernighan87b94932012-12-22 10:35:39 -0500190 p = reparse(s);
191 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
192 /* put ALL STAR in front of reg. exp. */
193 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
194 /* put FINAL after reg. exp. */
195
196 poscnt = 0;
197 penter(p1); /* enter parent pointers and leaf indices */
zoulasc65892082019-10-24 09:40:15 -0400198 if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
zoulascc16e8692019-10-17 13:04:46 -0400199 overflo(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500200 f->accept = poscnt-1; /* penter has computed number of positions in re */
201 cfoll(f, p1); /* set up follow sets */
202 freetr(p1);
zoulascc16e8692019-10-17 13:04:46 -0400203 resize_state(f, 1);
204 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
205 f->posns[1] = intalloc(1, __func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500206 *f->posns[1] = 0;
207 f->initstat = makeinit(f, anchor);
208 f->anchor = anchor;
209 f->restr = (uschar *) tostring(s);
Martijn Dekker8a222222019-01-23 09:12:27 +0000210 if (firstbasestr != basestr) {
211 if (basestr)
212 xfree(basestr);
213 }
Brian Kernighan87b94932012-12-22 10:35:39 -0500214 return f;
215}
216
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200217int makeinit(fa *f, bool anchor)
Brian Kernighan87b94932012-12-22 10:35:39 -0500218{
219 int i, k;
220
221 f->curstat = 2;
222 f->out[2] = 0;
Brian Kernighan87b94932012-12-22 10:35:39 -0500223 k = *(f->re[0].lfollow);
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600224 xfree(f->posns[2]);
zoulascc16e8692019-10-17 13:04:46 -0400225 f->posns[2] = intalloc(k + 1, __func__);
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200226 for (i = 0; i <= k; i++) {
Brian Kernighan87b94932012-12-22 10:35:39 -0500227 (f->posns[2])[i] = (f->re[0].lfollow)[i];
228 }
229 if ((f->posns[2])[1] == f->accept)
230 f->out[2] = 1;
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200231 for (i = 0; i < NCHARS; i++)
Brian Kernighan87b94932012-12-22 10:35:39 -0500232 f->gototab[2][i] = 0;
233 f->curstat = cgoto(f, 2, HAT);
234 if (anchor) {
235 *f->posns[2] = k-1; /* leave out position 0 */
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200236 for (i = 0; i < k; i++) {
Brian Kernighan87b94932012-12-22 10:35:39 -0500237 (f->posns[0])[i] = (f->posns[2])[i];
238 }
239
240 f->out[0] = f->out[2];
241 if (f->curstat != 2)
242 --(*f->posns[f->curstat]);
243 }
244 return f->curstat;
245}
246
247void penter(Node *p) /* set up parent pointers and leaf indices */
248{
249 switch (type(p)) {
250 ELEAF
251 LEAF
252 info(p) = poscnt;
253 poscnt++;
254 break;
255 UNARY
256 penter(left(p));
257 parent(left(p)) = p;
258 break;
259 case CAT:
260 case OR:
261 penter(left(p));
262 penter(right(p));
263 parent(left(p)) = p;
264 parent(right(p)) = p;
265 break;
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +0200266 case ZERO:
267 break;
Brian Kernighan87b94932012-12-22 10:35:39 -0500268 default: /* can't happen */
269 FATAL("can't happen: unknown type %d in penter", type(p));
270 break;
271 }
272}
273
274void freetr(Node *p) /* free parse tree */
275{
276 switch (type(p)) {
277 ELEAF
278 LEAF
279 xfree(p);
280 break;
281 UNARY
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +0200282 case ZERO:
Brian Kernighan87b94932012-12-22 10:35:39 -0500283 freetr(left(p));
284 xfree(p);
285 break;
286 case CAT:
287 case OR:
288 freetr(left(p));
289 freetr(right(p));
290 xfree(p);
291 break;
292 default: /* can't happen */
293 FATAL("can't happen: unknown type %d in freetr", type(p));
294 break;
295 }
296}
297
298/* in the parsing of regular expressions, metacharacters like . have */
299/* to be seen literally; \056 is not a metacharacter. */
300
zoulasc65892082019-10-24 09:40:15 -0400301int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
Brian Kernighan87b94932012-12-22 10:35:39 -0500302{ /* only pick up one 8-bit byte (2 chars) */
zoulasc65892082019-10-24 09:40:15 -0400303 const uschar *p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500304 int n = 0;
305 int i;
306
zoulasc65892082019-10-24 09:40:15 -0400307 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
Brian Kernighan87b94932012-12-22 10:35:39 -0500308 if (isdigit(*p))
309 n = 16 * n + *p - '0';
310 else if (*p >= 'a' && *p <= 'f')
311 n = 16 * n + *p - 'a' + 10;
312 else if (*p >= 'A' && *p <= 'F')
313 n = 16 * n + *p - 'A' + 10;
314 }
zoulasc65892082019-10-24 09:40:15 -0400315 *pp = p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500316 return n;
317}
318
319#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
320
zoulasc65892082019-10-24 09:40:15 -0400321int quoted(const uschar **pp) /* pick up next thing after a \\ */
Brian Kernighan87b94932012-12-22 10:35:39 -0500322 /* and increment *pp */
323{
zoulasc65892082019-10-24 09:40:15 -0400324 const uschar *p = *pp;
Brian Kernighan87b94932012-12-22 10:35:39 -0500325 int c;
326
327 if ((c = *p++) == 't')
328 c = '\t';
329 else if (c == 'n')
330 c = '\n';
331 else if (c == 'f')
332 c = '\f';
333 else if (c == 'r')
334 c = '\r';
335 else if (c == 'b')
336 c = '\b';
Martijn Dekker5b602ca2019-07-26 10:46:58 +0200337 else if (c == 'v')
338 c = '\v';
339 else if (c == 'a')
340 c = '\a';
Brian Kernighan87b94932012-12-22 10:35:39 -0500341 else if (c == '\\')
342 c = '\\';
343 else if (c == 'x') { /* hexadecimal goo follows */
344 c = hexstr(&p); /* this adds a null if number is invalid */
345 } else if (isoctdigit(c)) { /* \d \dd \ddd */
346 int n = c - '0';
347 if (isoctdigit(*p)) {
348 n = 8 * n + *p++ - '0';
349 if (isoctdigit(*p))
350 n = 8 * n + *p++ - '0';
351 }
352 c = n;
353 } /* else */
354 /* c = c; */
355 *pp = p;
356 return c;
357}
358
359char *cclenter(const char *argp) /* add a character class */
360{
361 int i, c, c2;
zoulasc65892082019-10-24 09:40:15 -0400362 const uschar *op, *p = (const uschar *) argp;
363 uschar *bp;
pfg52421942016-06-03 21:23:11 +0000364 static uschar *buf = NULL;
Brian Kernighan87b94932012-12-22 10:35:39 -0500365 static int bufsz = 100;
366
367 op = p;
zoulascc16e8692019-10-17 13:04:46 -0400368 if (buf == NULL && (buf = malloc(bufsz)) == NULL)
Brian Kernighan87b94932012-12-22 10:35:39 -0500369 FATAL("out of space for character class [%.10s...] 1", p);
370 bp = buf;
371 for (i = 0; (c = *p++) != 0; ) {
372 if (c == '\\') {
373 c = quoted(&p);
374 } else if (c == '-' && i > 0 && bp[-1] != 0) {
375 if (*p != 0) {
376 c = bp[-1];
377 c2 = *p++;
378 if (c2 == '\\')
379 c2 = quoted(&p);
380 if (c > c2) { /* empty; ignore */
381 bp--;
382 i--;
383 continue;
384 }
385 while (c < c2) {
386 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
387 FATAL("out of space for character class [%.10s...] 2", p);
388 *bp++ = ++c;
389 i++;
390 }
391 continue;
392 }
393 }
394 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
395 FATAL("out of space for character class [%.10s...] 3", p);
396 *bp++ = c;
397 i++;
398 }
399 *bp = 0;
400 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
401 xfree(op);
402 return (char *) tostring((char *) buf);
403}
404
405void overflo(const char *s)
406{
zoulascc16e8692019-10-17 13:04:46 -0400407 FATAL("regular expression too big: out of space in %.30s...", s);
Brian Kernighan87b94932012-12-22 10:35:39 -0500408}
409
410void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
411{
412 int i;
413 int *p;
414
415 switch (type(v)) {
416 ELEAF
417 LEAF
418 f->re[info(v)].ltype = type(v);
419 f->re[info(v)].lval.np = right(v);
420 while (f->accept >= maxsetvec) { /* guessing here! */
zoulascc16e8692019-10-17 13:04:46 -0400421 resizesetvec(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500422 }
423 for (i = 0; i <= f->accept; i++)
424 setvec[i] = 0;
425 setcnt = 0;
426 follow(v); /* computes setvec and setcnt */
zoulascc16e8692019-10-17 13:04:46 -0400427 p = intalloc(setcnt + 1, __func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500428 f->re[info(v)].lfollow = p;
429 *p = setcnt;
430 for (i = f->accept; i >= 0; i--)
431 if (setvec[i] == 1)
432 *++p = i;
433 break;
434 UNARY
435 cfoll(f,left(v));
436 break;
437 case CAT:
438 case OR:
439 cfoll(f,left(v));
440 cfoll(f,right(v));
441 break;
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +0200442 case ZERO:
443 break;
Brian Kernighan87b94932012-12-22 10:35:39 -0500444 default: /* can't happen */
445 FATAL("can't happen: unknown type %d in cfoll", type(v));
446 }
447}
448
449int first(Node *p) /* collects initially active leaves of p into setvec */
450 /* returns 0 if p matches empty string */
451{
452 int b, lp;
453
454 switch (type(p)) {
455 ELEAF
456 LEAF
457 lp = info(p); /* look for high-water mark of subscripts */
458 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
zoulascc16e8692019-10-17 13:04:46 -0400459 resizesetvec(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500460 }
461 if (type(p) == EMPTYRE) {
462 setvec[lp] = 0;
463 return(0);
464 }
465 if (setvec[lp] != 1) {
466 setvec[lp] = 1;
467 setcnt++;
468 }
469 if (type(p) == CCL && (*(char *) right(p)) == '\0')
470 return(0); /* empty CCL */
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200471 return(1);
Brian Kernighan87b94932012-12-22 10:35:39 -0500472 case PLUS:
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200473 if (first(left(p)) == 0)
474 return(0);
Brian Kernighan87b94932012-12-22 10:35:39 -0500475 return(1);
476 case STAR:
477 case QUEST:
478 first(left(p));
479 return(0);
480 case CAT:
481 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
482 return(1);
483 case OR:
484 b = first(right(p));
485 if (first(left(p)) == 0 || b == 0) return(0);
486 return(1);
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +0200487 case ZERO:
488 return 0;
Brian Kernighan87b94932012-12-22 10:35:39 -0500489 }
490 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
491 return(-1);
492}
493
494void follow(Node *v) /* collects leaves that can follow v into setvec */
495{
496 Node *p;
497
498 if (type(v) == FINAL)
499 return;
500 p = parent(v);
501 switch (type(p)) {
502 case STAR:
503 case PLUS:
504 first(v);
505 follow(p);
506 return;
507
508 case OR:
509 case QUEST:
510 follow(p);
511 return;
512
513 case CAT:
514 if (v == left(p)) { /* v is left child of p */
515 if (first(right(p)) == 0) {
516 follow(p);
517 return;
518 }
519 } else /* v is right child */
520 follow(p);
521 return;
522 }
523}
524
525int member(int c, const char *sarg) /* is c in s? */
526{
zoulasc65892082019-10-24 09:40:15 -0400527 const uschar *s = (const uschar *) sarg;
Brian Kernighan87b94932012-12-22 10:35:39 -0500528
529 while (*s)
530 if (c == *s++)
531 return(1);
532 return(0);
533}
534
535int match(fa *f, const char *p0) /* shortest match ? */
536{
537 int s, ns;
zoulasc65892082019-10-24 09:40:15 -0400538 const uschar *p = (const uschar *) p0;
Brian Kernighan87b94932012-12-22 10:35:39 -0500539
zoulascc16e8692019-10-17 13:04:46 -0400540 s = f->initstat;
541 assert (s < f->state_count);
542
Brian Kernighan87b94932012-12-22 10:35:39 -0500543 if (f->out[s])
544 return(1);
545 do {
546 /* assert(*p < NCHARS); */
547 if ((ns = f->gototab[s][*p]) != 0)
548 s = ns;
549 else
550 s = cgoto(f, s, *p);
551 if (f->out[s])
552 return(1);
553 } while (*p++ != 0);
554 return(0);
555}
556
557int pmatch(fa *f, const char *p0) /* longest match, for sub */
558{
559 int s, ns;
zoulasc65892082019-10-24 09:40:15 -0400560 const uschar *p = (const uschar *) p0;
561 const uschar *q;
Brian Kernighan87b94932012-12-22 10:35:39 -0500562
zoulascc16e8692019-10-17 13:04:46 -0400563 s = f->initstat;
564 assert(s < f->state_count);
565
zoulasc65892082019-10-24 09:40:15 -0400566 patbeg = (const char *)p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500567 patlen = -1;
568 do {
569 q = p;
570 do {
571 if (f->out[s]) /* final state */
572 patlen = q-p;
573 /* assert(*q < NCHARS); */
574 if ((ns = f->gototab[s][*q]) != 0)
575 s = ns;
576 else
577 s = cgoto(f, s, *q);
zoulascc16e8692019-10-17 13:04:46 -0400578
579 assert(s < f->state_count);
580
Brian Kernighan87b94932012-12-22 10:35:39 -0500581 if (s == 1) { /* no transition */
582 if (patlen >= 0) {
zoulasc65892082019-10-24 09:40:15 -0400583 patbeg = (const char *) p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500584 return(1);
585 }
586 else
587 goto nextin; /* no match */
588 }
589 } while (*q++ != 0);
590 if (f->out[s])
591 patlen = q-p-1; /* don't count $ */
592 if (patlen >= 0) {
zoulasc65892082019-10-24 09:40:15 -0400593 patbeg = (const char *) p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500594 return(1);
595 }
596 nextin:
597 s = 2;
zoulascc16e8692019-10-17 13:04:46 -0400598 } while (*p++);
Brian Kernighan87b94932012-12-22 10:35:39 -0500599 return (0);
600}
601
602int nematch(fa *f, const char *p0) /* non-empty match, for sub */
603{
604 int s, ns;
zoulasc65892082019-10-24 09:40:15 -0400605 const uschar *p = (const uschar *) p0;
606 const uschar *q;
Brian Kernighan87b94932012-12-22 10:35:39 -0500607
zoulascc16e8692019-10-17 13:04:46 -0400608 s = f->initstat;
609 assert(s < f->state_count);
610
zoulasc65892082019-10-24 09:40:15 -0400611 patbeg = (const char *)p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500612 patlen = -1;
613 while (*p) {
614 q = p;
615 do {
616 if (f->out[s]) /* final state */
617 patlen = q-p;
618 /* assert(*q < NCHARS); */
619 if ((ns = f->gototab[s][*q]) != 0)
620 s = ns;
621 else
622 s = cgoto(f, s, *q);
623 if (s == 1) { /* no transition */
624 if (patlen > 0) {
zoulasc65892082019-10-24 09:40:15 -0400625 patbeg = (const char *) p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500626 return(1);
627 } else
628 goto nnextin; /* no nonempty match */
629 }
630 } while (*q++ != 0);
631 if (f->out[s])
632 patlen = q-p-1; /* don't count $ */
633 if (patlen > 0 ) {
zoulasc65892082019-10-24 09:40:15 -0400634 patbeg = (const char *) p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500635 return(1);
636 }
637 nnextin:
638 s = 2;
Brian Kernighan87b94932012-12-22 10:35:39 -0500639 p++;
640 }
641 return (0);
642}
643
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300644
645/*
646 * NAME
647 * fnematch
648 *
649 * DESCRIPTION
650 * A stream-fed version of nematch which transfers characters to a
651 * null-terminated buffer. All characters up to and including the last
652 * character of the matching text or EOF are placed in the buffer. If
653 * a match is found, patbeg and patlen are set appropriately.
654 *
655 * RETURN VALUES
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200656 * false No match found.
657 * true Match found.
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300658 */
659
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200660bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300661{
Arnold D. Robbins5ff28202019-10-07 15:50:53 +0300662 char *buf = *pbuf;
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300663 int bufsize = *pbufsize;
664 int c, i, j, k, ns, s;
665
666 s = pfa->initstat;
667 patlen = 0;
668
669 /*
670 * All indices relative to buf.
671 * i <= j <= k <= bufsize
672 *
673 * i: origin of active substring
674 * j: current character
675 * k: destination of next getc()
676 */
677 i = -1, k = 0;
678 do {
679 j = i++;
680 do {
681 if (++j == k) {
682 if (k == bufsize)
683 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
684 FATAL("stream '%.30s...' too long", buf);
685 buf[k++] = (c = getc(f)) != EOF ? c : 0;
686 }
687 c = buf[j];
688 /* assert(c < NCHARS); */
689
690 if ((ns = pfa->gototab[s][c]) != 0)
691 s = ns;
692 else
693 s = cgoto(pfa, s, c);
694
695 if (pfa->out[s]) { /* final state */
696 patlen = j - i + 1;
697 if (c == 0) /* don't count $ */
698 patlen--;
699 }
700 } while (buf[j] && s != 1);
701 s = 2;
702 } while (buf[i] && !patlen);
703
704 /* adjbuf() may have relocated a resized buffer. Inform the world. */
705 *pbuf = buf;
706 *pbufsize = bufsize;
707
708 if (patlen) {
709 patbeg = (char *) buf + i;
710 /*
711 * Under no circumstances is the last character fed to
712 * the automaton part of the match. It is EOF's nullbyte,
713 * or it sent the automaton into a state with no further
714 * transitions available (s==1), or both. Room for a
715 * terminating nullbyte is guaranteed.
716 *
717 * ungetc any chars after the end of matching text
718 * (except for EOF's nullbyte, if present) and null
719 * terminate the buffer.
720 */
721 do
722 if (buf[--k] && ungetc(buf[k], f) == EOF)
723 FATAL("unable to ungetc '%c'", buf[k]);
724 while (k > i + patlen);
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200725 buf[k] = '\0';
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200726 return true;
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300727 }
728 else
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200729 return false;
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300730}
731
Brian Kernighan87b94932012-12-22 10:35:39 -0500732Node *reparse(const char *p) /* parses regular expression pointed to by p */
733{ /* uses relex() to scan regular expression */
734 Node *np;
735
736 dprintf( ("reparse <%s>\n", p) );
zoulasc65892082019-10-24 09:40:15 -0400737 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
Brian Kernighan87b94932012-12-22 10:35:39 -0500738 rtok = relex();
739 /* GNU compatibility: an empty regexp matches anything */
740 if (rtok == '\0') {
741 /* FATAL("empty regular expression"); previous */
742 return(op2(EMPTYRE, NIL, NIL));
743 }
744 np = regexp();
745 if (rtok != '\0')
746 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
747 return(np);
748}
749
750Node *regexp(void) /* top-level parse of reg expr */
751{
752 return (alt(concat(primary())));
753}
754
755Node *primary(void)
756{
757 Node *np;
Martijn Dekker8a222222019-01-23 09:12:27 +0000758 int savelastatom;
Brian Kernighan87b94932012-12-22 10:35:39 -0500759
760 switch (rtok) {
761 case CHAR:
Martijn Dekker8a222222019-01-23 09:12:27 +0000762 lastatom = starttok;
Brian Kernighan87b94932012-12-22 10:35:39 -0500763 np = op2(CHAR, NIL, itonp(rlxval));
764 rtok = relex();
765 return (unary(np));
766 case ALL:
767 rtok = relex();
768 return (unary(op2(ALL, NIL, NIL)));
769 case EMPTYRE:
770 rtok = relex();
Martijn Dekker8a222222019-01-23 09:12:27 +0000771 return (unary(op2(EMPTYRE, NIL, NIL)));
Brian Kernighan87b94932012-12-22 10:35:39 -0500772 case DOT:
Martijn Dekker8a222222019-01-23 09:12:27 +0000773 lastatom = starttok;
Brian Kernighan87b94932012-12-22 10:35:39 -0500774 rtok = relex();
775 return (unary(op2(DOT, NIL, NIL)));
776 case CCL:
zoulasc65892082019-10-24 09:40:15 -0400777 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
Martijn Dekker8a222222019-01-23 09:12:27 +0000778 lastatom = starttok;
Brian Kernighan87b94932012-12-22 10:35:39 -0500779 rtok = relex();
780 return (unary(np));
781 case NCCL:
zoulasc65892082019-10-24 09:40:15 -0400782 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
Martijn Dekker8a222222019-01-23 09:12:27 +0000783 lastatom = starttok;
Brian Kernighan87b94932012-12-22 10:35:39 -0500784 rtok = relex();
785 return (unary(np));
786 case '^':
787 rtok = relex();
788 return (unary(op2(CHAR, NIL, itonp(HAT))));
789 case '$':
790 rtok = relex();
791 return (unary(op2(CHAR, NIL, NIL)));
792 case '(':
Martijn Dekker8a222222019-01-23 09:12:27 +0000793 lastatom = starttok;
794 savelastatom = starttok - basestr; /* Retain over recursion */
Brian Kernighan87b94932012-12-22 10:35:39 -0500795 rtok = relex();
796 if (rtok == ')') { /* special pleading for () */
797 rtok = relex();
798 return unary(op2(CCL, NIL, (Node *) tostring("")));
799 }
800 np = regexp();
801 if (rtok == ')') {
Martijn Dekker8a222222019-01-23 09:12:27 +0000802 lastatom = basestr + savelastatom; /* Restore */
Brian Kernighan87b94932012-12-22 10:35:39 -0500803 rtok = relex();
804 return (unary(np));
805 }
806 else
807 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
808 default:
809 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
810 }
811 return 0; /*NOTREACHED*/
812}
813
814Node *concat(Node *np)
815{
816 switch (rtok) {
Martijn Dekker8a222222019-01-23 09:12:27 +0000817 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
Brian Kernighan87b94932012-12-22 10:35:39 -0500818 return (concat(op2(CAT, np, primary())));
Martijn Dekker8a222222019-01-23 09:12:27 +0000819 case EMPTYRE:
820 rtok = relex();
821 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
822 primary())));
Brian Kernighan87b94932012-12-22 10:35:39 -0500823 }
824 return (np);
825}
826
827Node *alt(Node *np)
828{
829 if (rtok == OR) {
830 rtok = relex();
831 return (alt(op2(OR, np, concat(primary()))));
832 }
833 return (np);
834}
835
836Node *unary(Node *np)
837{
838 switch (rtok) {
839 case STAR:
840 rtok = relex();
841 return (unary(op2(STAR, np, NIL)));
842 case PLUS:
843 rtok = relex();
844 return (unary(op2(PLUS, np, NIL)));
845 case QUEST:
846 rtok = relex();
847 return (unary(op2(QUEST, np, NIL)));
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +0200848 case ZERO:
849 rtok = relex();
850 return (unary(op2(ZERO, np, NIL)));
Brian Kernighan87b94932012-12-22 10:35:39 -0500851 default:
852 return (np);
853 }
854}
855
856/*
857 * Character class definitions conformant to the POSIX locale as
858 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
859 * and operating character sets are both ASCII (ISO646) or supersets
860 * thereof.
861 *
862 * Note that to avoid overflowing the temporary buffer used in
863 * relex(), the expanded character class (prior to range expansion)
864 * must be less than twice the size of their full name.
865 */
866
867/* Because isblank doesn't show up in any of the header files on any
868 * system i use, it's defined here. if some other locale has a richer
869 * definition of "blank", define HAS_ISBLANK and provide your own
870 * version.
871 * the parentheses here are an attempt to find a path through the maze
872 * of macro definition and/or function and/or version provided. thanks
873 * to nelson beebe for the suggestion; let's see if it works everywhere.
874 */
875
876/* #define HAS_ISBLANK */
877#ifndef HAS_ISBLANK
878
879int (xisblank)(int c)
880{
881 return c==' ' || c=='\t';
882}
883
884#endif
885
zoulasc6a877092020-01-24 04:11:59 -0500886static const struct charclass {
Brian Kernighan87b94932012-12-22 10:35:39 -0500887 const char *cc_name;
888 int cc_namelen;
889 int (*cc_func)(int);
890} charclasses[] = {
891 { "alnum", 5, isalnum },
892 { "alpha", 5, isalpha },
893#ifndef HAS_ISBLANK
Arnold D. Robbins32093f52018-08-22 20:40:26 +0300894 { "blank", 5, xisblank },
Brian Kernighan87b94932012-12-22 10:35:39 -0500895#else
896 { "blank", 5, isblank },
897#endif
898 { "cntrl", 5, iscntrl },
899 { "digit", 5, isdigit },
900 { "graph", 5, isgraph },
901 { "lower", 5, islower },
902 { "print", 5, isprint },
903 { "punct", 5, ispunct },
904 { "space", 5, isspace },
905 { "upper", 5, isupper },
906 { "xdigit", 6, isxdigit },
907 { NULL, 0, NULL },
908};
909
Martijn Dekker8a222222019-01-23 09:12:27 +0000910#define REPEAT_SIMPLE 0
911#define REPEAT_PLUS_APPENDED 1
912#define REPEAT_WITH_Q 2
913#define REPEAT_ZERO 3
914
915static int
916replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
917 int atomlen, int firstnum, int secondnum, int special_case)
918{
919 int i, j;
920 uschar *buf = 0;
921 int ret = 1;
Arnold D. Robbins944989b2020-01-06 00:01:46 -0700922 int init_q = (firstnum == 0); /* first added char will be ? */
Martijn Dekker8a222222019-01-23 09:12:27 +0000923 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
924 int prefix_length = reptok - basestr; /* prefix includes first rep */
zoulasc65892082019-10-24 09:40:15 -0400925 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
Martijn Dekker8a222222019-01-23 09:12:27 +0000926 int size = prefix_length + suffix_length;
927
928 if (firstnum > 1) { /* add room for reps 2 through firstnum */
929 size += atomlen*(firstnum-1);
930 }
931
932 /* Adjust size of buffer for special cases */
933 if (special_case == REPEAT_PLUS_APPENDED) {
934 size++; /* for the final + */
935 } else if (special_case == REPEAT_WITH_Q) {
936 size += init_q + (atomlen+1)* n_q_reps;
937 } else if (special_case == REPEAT_ZERO) {
938 size += 2; /* just a null ERE: () */
939 }
zoulascc16e8692019-10-17 13:04:46 -0400940 if ((buf = malloc(size + 1)) == NULL)
Martijn Dekker8a222222019-01-23 09:12:27 +0000941 FATAL("out of space in reg expr %.10s..", lastre);
942 memcpy(buf, basestr, prefix_length); /* copy prefix */
943 j = prefix_length;
944 if (special_case == REPEAT_ZERO) {
945 j -= atomlen;
946 buf[j++] = '(';
947 buf[j++] = ')';
948 }
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +0200949 for (i = 1; i < firstnum; i++) { /* copy x reps */
Martijn Dekker8a222222019-01-23 09:12:27 +0000950 memcpy(&buf[j], atom, atomlen);
951 j += atomlen;
952 }
953 if (special_case == REPEAT_PLUS_APPENDED) {
954 buf[j++] = '+';
955 } else if (special_case == REPEAT_WITH_Q) {
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200956 if (init_q)
957 buf[j++] = '?';
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +0200958 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
Martijn Dekker8a222222019-01-23 09:12:27 +0000959 memcpy(&buf[j], atom, atomlen);
960 j += atomlen;
961 buf[j++] = '?';
962 }
963 }
964 memcpy(&buf[j], reptok+reptoklen, suffix_length);
965 if (special_case == REPEAT_ZERO) {
966 buf[j+suffix_length] = '\0';
967 } else {
968 buf[size] = '\0';
969 }
970 /* free old basestr */
971 if (firstbasestr != basestr) {
972 if (basestr)
973 xfree(basestr);
974 }
975 basestr = buf;
976 prestr = buf + prefix_length;
977 if (special_case == REPEAT_ZERO) {
978 prestr -= atomlen;
979 ret++;
980 }
981 return ret;
982}
983
984static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
985 int atomlen, int firstnum, int secondnum)
986{
987 /*
988 In general, the repetition specifier or "bound" is replaced here
989 by an equivalent ERE string, repeating the immediately previous atom
990 and appending ? and + as needed. Note that the first copy of the
991 atom is left in place, except in the special_case of a zero-repeat
992 (i.e., {0}).
993 */
994 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
995 if (firstnum < 2) {
996 /* 0 or 1: should be handled before you get here */
Martijn Dekker0619d5d2019-02-21 22:38:16 +0100997 FATAL("internal error");
Martijn Dekker8a222222019-01-23 09:12:27 +0000998 } else {
999 return replace_repeat(reptok, reptoklen, atom, atomlen,
1000 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1001 }
1002 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1003 if (firstnum == 0) { /* {0} or {0,0} */
1004 /* This case is unusual because the resulting
1005 replacement string might actually be SMALLER than
1006 the original ERE */
1007 return replace_repeat(reptok, reptoklen, atom, atomlen,
1008 firstnum, secondnum, REPEAT_ZERO);
1009 } else { /* (firstnum >= 1) */
1010 return replace_repeat(reptok, reptoklen, atom, atomlen,
1011 firstnum, secondnum, REPEAT_SIMPLE);
1012 }
1013 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1014 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1015 return replace_repeat(reptok, reptoklen, atom, atomlen,
1016 firstnum, secondnum, REPEAT_WITH_Q);
1017 } else { /* Error - shouldn't be here (n>m) */
Martijn Dekker0619d5d2019-02-21 22:38:16 +01001018 FATAL("internal error");
Martijn Dekker8a222222019-01-23 09:12:27 +00001019 }
1020 return 0;
1021}
Brian Kernighan87b94932012-12-22 10:35:39 -05001022
1023int relex(void) /* lexical analyzer for reparse */
1024{
1025 int c, n;
1026 int cflag;
pfg52421942016-06-03 21:23:11 +00001027 static uschar *buf = NULL;
Brian Kernighan87b94932012-12-22 10:35:39 -05001028 static int bufsz = 100;
1029 uschar *bp;
zoulasc6a877092020-01-24 04:11:59 -05001030 const struct charclass *cc;
Brian Kernighan87b94932012-12-22 10:35:39 -05001031 int i;
Arnold D. Robbins108224b2019-11-10 21:19:18 +02001032 int num, m;
1033 bool commafound, digitfound;
Martijn Dekker8a222222019-01-23 09:12:27 +00001034 const uschar *startreptok;
Arnold D. Robbins28dacbd2019-06-04 23:53:31 -06001035 static int parens = 0;
Martijn Dekker8a222222019-01-23 09:12:27 +00001036
1037rescan:
1038 starttok = prestr;
Brian Kernighan87b94932012-12-22 10:35:39 -05001039
1040 switch (c = *prestr++) {
1041 case '|': return OR;
1042 case '*': return STAR;
1043 case '+': return PLUS;
1044 case '?': return QUEST;
1045 case '.': return DOT;
1046 case '\0': prestr--; return '\0';
1047 case '^':
1048 case '$':
Brian Kernighan87b94932012-12-22 10:35:39 -05001049 return c;
Arnold D. Robbins28dacbd2019-06-04 23:53:31 -06001050 case '(':
1051 parens++;
1052 return c;
1053 case ')':
1054 if (parens) {
1055 parens--;
1056 return c;
1057 }
1058 /* unmatched close parenthesis; per POSIX, treat as literal */
1059 rlxval = c;
1060 return CHAR;
Brian Kernighan87b94932012-12-22 10:35:39 -05001061 case '\\':
1062 rlxval = quoted(&prestr);
1063 return CHAR;
1064 default:
1065 rlxval = c;
1066 return CHAR;
Arnold D. Robbins795a06b2019-07-28 05:51:52 -06001067 case '[':
zoulascc16e8692019-10-17 13:04:46 -04001068 if (buf == NULL && (buf = malloc(bufsz)) == NULL)
Brian Kernighan87b94932012-12-22 10:35:39 -05001069 FATAL("out of space in reg expr %.10s..", lastre);
1070 bp = buf;
1071 if (*prestr == '^') {
1072 cflag = 1;
1073 prestr++;
1074 }
1075 else
1076 cflag = 0;
1077 n = 2 * strlen((const char *) prestr)+1;
1078 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1079 FATAL("out of space for reg expr %.10s...", lastre);
1080 for (; ; ) {
1081 if ((c = *prestr++) == '\\') {
1082 *bp++ = '\\';
1083 if ((c = *prestr++) == '\0')
1084 FATAL("nonterminated character class %.20s...", lastre);
1085 *bp++ = c;
1086 /* } else if (c == '\n') { */
1087 /* FATAL("newline in character class %.20s...", lastre); */
1088 } else if (c == '[' && *prestr == ':') {
1089 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1090 for (cc = charclasses; cc->cc_name; cc++)
1091 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1092 break;
1093 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1094 prestr[2 + cc->cc_namelen] == ']') {
1095 prestr += cc->cc_namelen + 3;
Cody Peter Melloa6392ef2018-11-12 10:25:44 -08001096 /*
1097 * BUG: We begin at 1, instead of 0, since we
1098 * would otherwise prematurely terminate the
1099 * string for classes like [[:cntrl:]]. This
1100 * means that we can't match the NUL character,
1101 * not without first adapting the entire
1102 * program to track each string's length.
1103 */
Leonardo Taccari031aac82019-01-28 17:34:58 +01001104 for (i = 1; i <= UCHAR_MAX; i++) {
Brian Kernighan87b94932012-12-22 10:35:39 -05001105 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1106 FATAL("out of space for reg expr %.10s...", lastre);
1107 if (cc->cc_func(i)) {
1108 *bp++ = i;
1109 n++;
1110 }
1111 }
1112 } else
1113 *bp++ = c;
Martijn Dekker8a222222019-01-23 09:12:27 +00001114 } else if (c == '[' && *prestr == '.') {
1115 char collate_char;
1116 prestr++;
1117 collate_char = *prestr++;
1118 if (*prestr == '.' && prestr[1] == ']') {
1119 prestr += 2;
1120 /* Found it: map via locale TBD: for
1121 now, simply return this char. This
1122 is sufficient to pass conformance
1123 test awk.ex 156
1124 */
1125 if (*prestr == ']') {
1126 prestr++;
1127 rlxval = collate_char;
1128 return CHAR;
1129 }
1130 }
1131 } else if (c == '[' && *prestr == '=') {
1132 char equiv_char;
1133 prestr++;
1134 equiv_char = *prestr++;
1135 if (*prestr == '=' && prestr[1] == ']') {
1136 prestr += 2;
1137 /* Found it: map via locale TBD: for now
1138 simply return this char. This is
1139 sufficient to pass conformance test
1140 awk.ex 156
1141 */
1142 if (*prestr == ']') {
1143 prestr++;
1144 rlxval = equiv_char;
1145 return CHAR;
1146 }
1147 }
Brian Kernighan87b94932012-12-22 10:35:39 -05001148 } else if (c == '\0') {
1149 FATAL("nonterminated character class %.20s", lastre);
1150 } else if (bp == buf) { /* 1st char is special */
1151 *bp++ = c;
1152 } else if (c == ']') {
1153 *bp++ = 0;
1154 rlxstr = (uschar *) tostring((char *) buf);
1155 if (cflag == 0)
1156 return CCL;
1157 else
1158 return NCCL;
1159 } else
1160 *bp++ = c;
1161 }
Martijn Dekker8a222222019-01-23 09:12:27 +00001162 break;
1163 case '{':
1164 if (isdigit(*(prestr))) {
1165 num = 0; /* Process as a repetition */
1166 n = -1; m = -1;
Arnold D. Robbins108224b2019-11-10 21:19:18 +02001167 commafound = false;
1168 digitfound = false;
Martijn Dekker8a222222019-01-23 09:12:27 +00001169 startreptok = prestr-1;
1170 /* Remember start of previous atom here ? */
1171 } else { /* just a { char, not a repetition */
1172 rlxval = c;
1173 return CHAR;
1174 }
1175 for (; ; ) {
1176 if ((c = *prestr++) == '}') {
1177 if (commafound) {
1178 if (digitfound) { /* {n,m} */
1179 m = num;
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +02001180 if (m < n)
Martijn Dekker8a222222019-01-23 09:12:27 +00001181 FATAL("illegal repetition expression: class %.20s",
1182 lastre);
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +02001183 if (n == 0 && m == 1) {
Martijn Dekker8a222222019-01-23 09:12:27 +00001184 return QUEST;
1185 }
1186 } else { /* {n,} */
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +02001187 if (n == 0)
1188 return STAR;
1189 else if (n == 1)
1190 return PLUS;
Martijn Dekker8a222222019-01-23 09:12:27 +00001191 }
1192 } else {
1193 if (digitfound) { /* {n} same as {n,n} */
1194 n = num;
1195 m = num;
1196 } else { /* {} */
1197 FATAL("illegal repetition expression: class %.20s",
1198 lastre);
1199 }
1200 }
1201 if (repeat(starttok, prestr-starttok, lastatom,
1202 startreptok - lastatom, n, m) > 0) {
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +02001203 if (n == 0 && m == 0) {
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +02001204 return ZERO;
Martijn Dekker8a222222019-01-23 09:12:27 +00001205 }
1206 /* must rescan input for next token */
1207 goto rescan;
1208 }
1209 /* Failed to replace: eat up {...} characters
1210 and treat like just PLUS */
1211 return PLUS;
1212 } else if (c == '\0') {
1213 FATAL("nonterminated character class %.20s",
1214 lastre);
1215 } else if (isdigit(c)) {
1216 num = 10 * num + c - '0';
Arnold D. Robbins108224b2019-11-10 21:19:18 +02001217 digitfound = true;
Martijn Dekker8a222222019-01-23 09:12:27 +00001218 } else if (c == ',') {
1219 if (commafound)
1220 FATAL("illegal repetition expression: class %.20s",
1221 lastre);
1222 /* looking for {n,} or {n,m} */
Arnold D. Robbins108224b2019-11-10 21:19:18 +02001223 commafound = true;
Martijn Dekker8a222222019-01-23 09:12:27 +00001224 n = num;
Arnold D. Robbins108224b2019-11-10 21:19:18 +02001225 digitfound = false; /* reset */
Martijn Dekker8a222222019-01-23 09:12:27 +00001226 num = 0;
1227 } else {
1228 FATAL("illegal repetition expression: class %.20s",
1229 lastre);
1230 }
1231 }
1232 break;
Brian Kernighan87b94932012-12-22 10:35:39 -05001233 }
1234}
1235
1236int cgoto(fa *f, int s, int c)
1237{
Brian Kernighan87b94932012-12-22 10:35:39 -05001238 int *p, *q;
zoulascc16e8692019-10-17 13:04:46 -04001239 int i, j, k;
Brian Kernighan87b94932012-12-22 10:35:39 -05001240
1241 assert(c == HAT || c < NCHARS);
1242 while (f->accept >= maxsetvec) { /* guessing here! */
zoulascc16e8692019-10-17 13:04:46 -04001243 resizesetvec(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -05001244 }
1245 for (i = 0; i <= f->accept; i++)
1246 setvec[i] = 0;
1247 setcnt = 0;
zoulascc16e8692019-10-17 13:04:46 -04001248 resize_state(f, s);
Brian Kernighan87b94932012-12-22 10:35:39 -05001249 /* compute positions of gototab[s,c] into setvec */
1250 p = f->posns[s];
1251 for (i = 1; i <= *p; i++) {
1252 if ((k = f->re[p[i]].ltype) != FINAL) {
1253 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1254 || (k == DOT && c != 0 && c != HAT)
1255 || (k == ALL && c != 0)
1256 || (k == EMPTYRE && c != 0)
1257 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1258 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1259 q = f->re[p[i]].lfollow;
1260 for (j = 1; j <= *q; j++) {
1261 if (q[j] >= maxsetvec) {
zoulascc16e8692019-10-17 13:04:46 -04001262 resizesetvec(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -05001263 }
1264 if (setvec[q[j]] == 0) {
1265 setcnt++;
1266 setvec[q[j]] = 1;
1267 }
1268 }
1269 }
1270 }
1271 }
1272 /* determine if setvec is a previous state */
1273 tmpset[0] = setcnt;
1274 j = 1;
1275 for (i = f->accept; i >= 0; i--)
1276 if (setvec[i]) {
1277 tmpset[j++] = i;
1278 }
zoulascc16e8692019-10-17 13:04:46 -04001279 resize_state(f, f->curstat > s ? f->curstat : s);
Brian Kernighan87b94932012-12-22 10:35:39 -05001280 /* tmpset == previous state? */
1281 for (i = 1; i <= f->curstat; i++) {
1282 p = f->posns[i];
1283 if ((k = tmpset[0]) != p[0])
1284 goto different;
1285 for (j = 1; j <= k; j++)
1286 if (tmpset[j] != p[j])
1287 goto different;
1288 /* setvec is state i */
zoulascaf86dac2019-12-09 02:00:45 -05001289 if (c != HAT)
1290 f->gototab[s][c] = i;
Brian Kernighan87b94932012-12-22 10:35:39 -05001291 return i;
1292 different:;
1293 }
1294
1295 /* add tmpset to current set of states */
zoulascc16e8692019-10-17 13:04:46 -04001296 ++(f->curstat);
1297 resize_state(f, f->curstat);
Brian Kernighan87b94932012-12-22 10:35:39 -05001298 for (i = 0; i < NCHARS; i++)
1299 f->gototab[f->curstat][i] = 0;
1300 xfree(f->posns[f->curstat]);
zoulascc16e8692019-10-17 13:04:46 -04001301 p = intalloc(setcnt + 1, __func__);
Brian Kernighan87b94932012-12-22 10:35:39 -05001302
1303 f->posns[f->curstat] = p;
zoulascaf86dac2019-12-09 02:00:45 -05001304 if (c != HAT)
1305 f->gototab[s][c] = f->curstat;
Brian Kernighan87b94932012-12-22 10:35:39 -05001306 for (i = 0; i <= setcnt; i++)
1307 p[i] = tmpset[i];
1308 if (setvec[f->accept])
1309 f->out[f->curstat] = 1;
1310 else
1311 f->out[f->curstat] = 0;
1312 return f->curstat;
1313}
1314
1315
1316void freefa(fa *f) /* free a finite automaton */
1317{
1318 int i;
1319
1320 if (f == NULL)
1321 return;
zoulascc16e8692019-10-17 13:04:46 -04001322 for (i = 0; i < f->state_count; i++)
1323 xfree(f->gototab[i])
Brian Kernighan87b94932012-12-22 10:35:39 -05001324 for (i = 0; i <= f->curstat; i++)
1325 xfree(f->posns[i]);
1326 for (i = 0; i <= f->accept; i++) {
1327 xfree(f->re[i].lfollow);
1328 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +02001329 xfree(f->re[i].lval.np);
Brian Kernighan87b94932012-12-22 10:35:39 -05001330 }
1331 xfree(f->restr);
zoulascc16e8692019-10-17 13:04:46 -04001332 xfree(f->out);
1333 xfree(f->posns);
1334 xfree(f->gototab);
Brian Kernighan87b94932012-12-22 10:35:39 -05001335 xfree(f);
1336}