blob: 6fb734359ae5c256d1db98d283346d2a7c418601 [file] [log] [blame]
Brian Kernighan87b94932012-12-22 10:35:39 -05001/****************************************************************
2Copyright (C) Lucent Technologies 1997
3All Rights Reserved
4
5Permission to use, copy, modify, and distribute this software and
6its documentation for any purpose and without fee is hereby
7granted, provided that the above copyright notice appear in all
8copies and that both that the copyright notice and this
9permission notice and warranty disclaimer appear in supporting
10documentation, and that the name Lucent Technologies or any of
11its entities not be used in advertising or publicity pertaining
12to distribution of the software without specific, written prior
13permission.
14
15LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22THIS SOFTWARE.
23****************************************************************/
24
25/* lasciate ogne speranza, voi ch'intrate. */
26
27#define DEBUG
28
29#include <ctype.h>
Leonardo Taccari05014f52018-08-29 18:06:33 +020030#include <limits.h>
Brian Kernighan87b94932012-12-22 10:35:39 -050031#include <stdio.h>
32#include <string.h>
33#include <stdlib.h>
34#include "awk.h"
Arnold D. Robbins07f04382020-07-30 17:12:45 +030035#include "awkgram.tab.h"
Brian Kernighan87b94932012-12-22 10:35:39 -050036
Brian Kernighan87b94932012-12-22 10:35:39 -050037#define MAXLIN 22
38
39#define type(v) (v)->nobj /* badly overloaded here */
40#define info(v) (v)->ntype /* badly overloaded here */
41#define left(v) (v)->narg[0]
42#define right(v) (v)->narg[1]
43#define parent(v) (v)->nnext
44
45#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46#define ELEAF case EMPTYRE: /* empty string in regexp */
47#define UNARY case STAR: case PLUS: case QUEST:
48
49/* encoding in tree Nodes:
50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 left is index, right contains value or pointer to value
52 unary (STAR, PLUS, QUEST): left is child, right is null
53 binary (CAT, OR): left and right are children
54 parent contains pointer to parent
55*/
56
57
58int *setvec;
59int *tmpset;
60int maxsetvec = 0;
61
62int rtok; /* next token in current re */
63int rlxval;
zoulasc65892082019-10-24 09:40:15 -040064static const uschar *rlxstr;
65static const uschar *prestr; /* current position in current re */
66static const uschar *lastre; /* origin of last re */
67static const uschar *lastatom; /* origin of last Atom */
68static const uschar *starttok;
69static const uschar *basestr; /* starts with original, replaced during
Martijn Dekker8a222222019-01-23 09:12:27 +000070 repetition processing */
zoulasc65892082019-10-24 09:40:15 -040071static const uschar *firstbasestr;
Brian Kernighan87b94932012-12-22 10:35:39 -050072
73static int setcnt;
74static int poscnt;
75
zoulasc65892082019-10-24 09:40:15 -040076const char *patbeg;
Brian Kernighan87b94932012-12-22 10:35:39 -050077int patlen;
78
zoulascc16e8692019-10-17 13:04:46 -040079#define NFA 128 /* cache this many dynamic fa's */
Brian Kernighan87b94932012-12-22 10:35:39 -050080fa *fatab[NFA];
81int nfatab = 0; /* entries in fatab */
82
zoulascc16e8692019-10-17 13:04:46 -040083static int *
84intalloc(size_t n, const char *f)
85{
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +030086 int *p = (int *) calloc(n, sizeof(int));
zoulascc16e8692019-10-17 13:04:46 -040087 if (p == NULL)
88 overflo(f);
89 return p;
90}
91
92static void
93resizesetvec(const char *f)
94{
95 if (maxsetvec == 0)
96 maxsetvec = MAXLIN;
97 else
98 maxsetvec *= 4;
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +030099 setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
100 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
zoulascc16e8692019-10-17 13:04:46 -0400101 if (setvec == NULL || tmpset == NULL)
102 overflo(f);
103}
104
105static void
106resize_state(fa *f, int state)
107{
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +0300108 unsigned int **p;
109 uschar *p2;
110 int **p3;
zoulascc16e8692019-10-17 13:04:46 -0400111 int i, new_count;
112
113 if (++state < f->state_count)
114 return;
115
116 new_count = state + 10; /* needs to be tuned */
117
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +0300118 p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
zoulascc16e8692019-10-17 13:04:46 -0400119 if (p == NULL)
120 goto out;
121 f->gototab = p;
122
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +0300123 p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
124 if (p2 == NULL)
zoulascc16e8692019-10-17 13:04:46 -0400125 goto out;
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +0300126 f->out = p2;
zoulascc16e8692019-10-17 13:04:46 -0400127
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +0300128 p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
129 if (p3 == NULL)
zoulascc16e8692019-10-17 13:04:46 -0400130 goto out;
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +0300131 f->posns = p3;
zoulascc16e8692019-10-17 13:04:46 -0400132
133 for (i = f->state_count; i < new_count; ++i) {
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +0300134 f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
zoulascc16e8692019-10-17 13:04:46 -0400135 if (f->gototab[i] == NULL)
136 goto out;
137 f->out[i] = 0;
138 f->posns[i] = NULL;
139 }
140 f->state_count = new_count;
141 return;
142out:
143 overflo(__func__);
144}
145
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200146fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
Brian Kernighan87b94932012-12-22 10:35:39 -0500147{
148 int i, use, nuse;
149 fa *pfa;
150 static int now = 1;
151
pfg52421942016-06-03 21:23:11 +0000152 if (setvec == NULL) { /* first time through any RE */
zoulascc16e8692019-10-17 13:04:46 -0400153 resizesetvec(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500154 }
155
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200156 if (compile_time != RUNNING) /* a constant for sure */
Brian Kernighan87b94932012-12-22 10:35:39 -0500157 return mkdfa(s, anchor);
158 for (i = 0; i < nfatab; i++) /* is it there already? */
159 if (fatab[i]->anchor == anchor
160 && strcmp((const char *) fatab[i]->restr, s) == 0) {
161 fatab[i]->use = now++;
162 return fatab[i];
163 }
164 pfa = mkdfa(s, anchor);
165 if (nfatab < NFA) { /* room for another */
166 fatab[nfatab] = pfa;
167 fatab[nfatab]->use = now++;
168 nfatab++;
169 return pfa;
170 }
171 use = fatab[0]->use; /* replace least-recently used */
172 nuse = 0;
173 for (i = 1; i < nfatab; i++)
174 if (fatab[i]->use < use) {
175 use = fatab[i]->use;
176 nuse = i;
177 }
178 freefa(fatab[nuse]);
179 fatab[nuse] = pfa;
180 pfa->use = now++;
181 return pfa;
182}
183
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200184fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200185 /* anchor = true for anchored matches, else false */
Brian Kernighan87b94932012-12-22 10:35:39 -0500186{
187 Node *p, *p1;
188 fa *f;
189
zoulasc65892082019-10-24 09:40:15 -0400190 firstbasestr = (const uschar *) s;
Martijn Dekker8a222222019-01-23 09:12:27 +0000191 basestr = firstbasestr;
Brian Kernighan87b94932012-12-22 10:35:39 -0500192 p = reparse(s);
193 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
194 /* put ALL STAR in front of reg. exp. */
195 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
196 /* put FINAL after reg. exp. */
197
198 poscnt = 0;
199 penter(p1); /* enter parent pointers and leaf indices */
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +0300200 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
zoulascc16e8692019-10-17 13:04:46 -0400201 overflo(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500202 f->accept = poscnt-1; /* penter has computed number of positions in re */
203 cfoll(f, p1); /* set up follow sets */
204 freetr(p1);
zoulascc16e8692019-10-17 13:04:46 -0400205 resize_state(f, 1);
206 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
207 f->posns[1] = intalloc(1, __func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500208 *f->posns[1] = 0;
209 f->initstat = makeinit(f, anchor);
210 f->anchor = anchor;
211 f->restr = (uschar *) tostring(s);
Martijn Dekker8a222222019-01-23 09:12:27 +0000212 if (firstbasestr != basestr) {
213 if (basestr)
214 xfree(basestr);
215 }
Brian Kernighan87b94932012-12-22 10:35:39 -0500216 return f;
217}
218
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200219int makeinit(fa *f, bool anchor)
Brian Kernighan87b94932012-12-22 10:35:39 -0500220{
221 int i, k;
222
223 f->curstat = 2;
224 f->out[2] = 0;
Brian Kernighan87b94932012-12-22 10:35:39 -0500225 k = *(f->re[0].lfollow);
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600226 xfree(f->posns[2]);
zoulascc16e8692019-10-17 13:04:46 -0400227 f->posns[2] = intalloc(k + 1, __func__);
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200228 for (i = 0; i <= k; i++) {
Brian Kernighan87b94932012-12-22 10:35:39 -0500229 (f->posns[2])[i] = (f->re[0].lfollow)[i];
230 }
231 if ((f->posns[2])[1] == f->accept)
232 f->out[2] = 1;
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200233 for (i = 0; i < NCHARS; i++)
Brian Kernighan87b94932012-12-22 10:35:39 -0500234 f->gototab[2][i] = 0;
235 f->curstat = cgoto(f, 2, HAT);
236 if (anchor) {
237 *f->posns[2] = k-1; /* leave out position 0 */
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200238 for (i = 0; i < k; i++) {
Brian Kernighan87b94932012-12-22 10:35:39 -0500239 (f->posns[0])[i] = (f->posns[2])[i];
240 }
241
242 f->out[0] = f->out[2];
243 if (f->curstat != 2)
244 --(*f->posns[f->curstat]);
245 }
246 return f->curstat;
247}
248
249void penter(Node *p) /* set up parent pointers and leaf indices */
250{
251 switch (type(p)) {
252 ELEAF
253 LEAF
254 info(p) = poscnt;
255 poscnt++;
256 break;
257 UNARY
258 penter(left(p));
259 parent(left(p)) = p;
260 break;
261 case CAT:
262 case OR:
263 penter(left(p));
264 penter(right(p));
265 parent(left(p)) = p;
266 parent(right(p)) = p;
267 break;
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +0200268 case ZERO:
269 break;
Brian Kernighan87b94932012-12-22 10:35:39 -0500270 default: /* can't happen */
271 FATAL("can't happen: unknown type %d in penter", type(p));
272 break;
273 }
274}
275
276void freetr(Node *p) /* free parse tree */
277{
278 switch (type(p)) {
279 ELEAF
280 LEAF
281 xfree(p);
282 break;
283 UNARY
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +0200284 case ZERO:
Brian Kernighan87b94932012-12-22 10:35:39 -0500285 freetr(left(p));
286 xfree(p);
287 break;
288 case CAT:
289 case OR:
290 freetr(left(p));
291 freetr(right(p));
292 xfree(p);
293 break;
294 default: /* can't happen */
295 FATAL("can't happen: unknown type %d in freetr", type(p));
296 break;
297 }
298}
299
300/* in the parsing of regular expressions, metacharacters like . have */
301/* to be seen literally; \056 is not a metacharacter. */
302
zoulasc65892082019-10-24 09:40:15 -0400303int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
Brian Kernighan87b94932012-12-22 10:35:39 -0500304{ /* only pick up one 8-bit byte (2 chars) */
zoulasc65892082019-10-24 09:40:15 -0400305 const uschar *p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500306 int n = 0;
307 int i;
308
zoulasc65892082019-10-24 09:40:15 -0400309 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
Brian Kernighan87b94932012-12-22 10:35:39 -0500310 if (isdigit(*p))
311 n = 16 * n + *p - '0';
312 else if (*p >= 'a' && *p <= 'f')
313 n = 16 * n + *p - 'a' + 10;
314 else if (*p >= 'A' && *p <= 'F')
315 n = 16 * n + *p - 'A' + 10;
316 }
zoulasc65892082019-10-24 09:40:15 -0400317 *pp = p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500318 return n;
319}
320
321#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
322
zoulasc65892082019-10-24 09:40:15 -0400323int quoted(const uschar **pp) /* pick up next thing after a \\ */
Brian Kernighan87b94932012-12-22 10:35:39 -0500324 /* and increment *pp */
325{
zoulasc65892082019-10-24 09:40:15 -0400326 const uschar *p = *pp;
Brian Kernighan87b94932012-12-22 10:35:39 -0500327 int c;
328
329 if ((c = *p++) == 't')
330 c = '\t';
331 else if (c == 'n')
332 c = '\n';
333 else if (c == 'f')
334 c = '\f';
335 else if (c == 'r')
336 c = '\r';
337 else if (c == 'b')
338 c = '\b';
Martijn Dekker5b602ca2019-07-26 10:46:58 +0200339 else if (c == 'v')
340 c = '\v';
341 else if (c == 'a')
342 c = '\a';
Brian Kernighan87b94932012-12-22 10:35:39 -0500343 else if (c == '\\')
344 c = '\\';
345 else if (c == 'x') { /* hexadecimal goo follows */
346 c = hexstr(&p); /* this adds a null if number is invalid */
347 } else if (isoctdigit(c)) { /* \d \dd \ddd */
348 int n = c - '0';
349 if (isoctdigit(*p)) {
350 n = 8 * n + *p++ - '0';
351 if (isoctdigit(*p))
352 n = 8 * n + *p++ - '0';
353 }
354 c = n;
355 } /* else */
356 /* c = c; */
357 *pp = p;
358 return c;
359}
360
361char *cclenter(const char *argp) /* add a character class */
362{
363 int i, c, c2;
zoulasc65892082019-10-24 09:40:15 -0400364 const uschar *op, *p = (const uschar *) argp;
365 uschar *bp;
pfg52421942016-06-03 21:23:11 +0000366 static uschar *buf = NULL;
Brian Kernighan87b94932012-12-22 10:35:39 -0500367 static int bufsz = 100;
368
369 op = p;
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +0300370 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
Brian Kernighan87b94932012-12-22 10:35:39 -0500371 FATAL("out of space for character class [%.10s...] 1", p);
372 bp = buf;
373 for (i = 0; (c = *p++) != 0; ) {
374 if (c == '\\') {
375 c = quoted(&p);
376 } else if (c == '-' && i > 0 && bp[-1] != 0) {
377 if (*p != 0) {
378 c = bp[-1];
379 c2 = *p++;
380 if (c2 == '\\')
381 c2 = quoted(&p);
382 if (c > c2) { /* empty; ignore */
383 bp--;
384 i--;
385 continue;
386 }
387 while (c < c2) {
388 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
389 FATAL("out of space for character class [%.10s...] 2", p);
390 *bp++ = ++c;
391 i++;
392 }
393 continue;
394 }
395 }
396 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
397 FATAL("out of space for character class [%.10s...] 3", p);
398 *bp++ = c;
399 i++;
400 }
401 *bp = 0;
Todd C. Miller292d39f2020-06-25 12:32:34 -0600402 DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
Brian Kernighan87b94932012-12-22 10:35:39 -0500403 xfree(op);
404 return (char *) tostring((char *) buf);
405}
406
407void overflo(const char *s)
408{
zoulascc16e8692019-10-17 13:04:46 -0400409 FATAL("regular expression too big: out of space in %.30s...", s);
Brian Kernighan87b94932012-12-22 10:35:39 -0500410}
411
412void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
413{
414 int i;
415 int *p;
416
417 switch (type(v)) {
418 ELEAF
419 LEAF
420 f->re[info(v)].ltype = type(v);
421 f->re[info(v)].lval.np = right(v);
422 while (f->accept >= maxsetvec) { /* guessing here! */
zoulascc16e8692019-10-17 13:04:46 -0400423 resizesetvec(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500424 }
425 for (i = 0; i <= f->accept; i++)
426 setvec[i] = 0;
427 setcnt = 0;
428 follow(v); /* computes setvec and setcnt */
zoulascc16e8692019-10-17 13:04:46 -0400429 p = intalloc(setcnt + 1, __func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500430 f->re[info(v)].lfollow = p;
431 *p = setcnt;
432 for (i = f->accept; i >= 0; i--)
433 if (setvec[i] == 1)
434 *++p = i;
435 break;
436 UNARY
437 cfoll(f,left(v));
438 break;
439 case CAT:
440 case OR:
441 cfoll(f,left(v));
442 cfoll(f,right(v));
443 break;
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +0200444 case ZERO:
445 break;
Brian Kernighan87b94932012-12-22 10:35:39 -0500446 default: /* can't happen */
447 FATAL("can't happen: unknown type %d in cfoll", type(v));
448 }
449}
450
451int first(Node *p) /* collects initially active leaves of p into setvec */
452 /* returns 0 if p matches empty string */
453{
454 int b, lp;
455
456 switch (type(p)) {
457 ELEAF
458 LEAF
459 lp = info(p); /* look for high-water mark of subscripts */
460 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
zoulascc16e8692019-10-17 13:04:46 -0400461 resizesetvec(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -0500462 }
463 if (type(p) == EMPTYRE) {
464 setvec[lp] = 0;
465 return(0);
466 }
467 if (setvec[lp] != 1) {
468 setvec[lp] = 1;
469 setcnt++;
470 }
471 if (type(p) == CCL && (*(char *) right(p)) == '\0')
472 return(0); /* empty CCL */
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200473 return(1);
Brian Kernighan87b94932012-12-22 10:35:39 -0500474 case PLUS:
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200475 if (first(left(p)) == 0)
476 return(0);
Brian Kernighan87b94932012-12-22 10:35:39 -0500477 return(1);
478 case STAR:
479 case QUEST:
480 first(left(p));
481 return(0);
482 case CAT:
483 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
484 return(1);
485 case OR:
486 b = first(right(p));
487 if (first(left(p)) == 0 || b == 0) return(0);
488 return(1);
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +0200489 case ZERO:
490 return 0;
Brian Kernighan87b94932012-12-22 10:35:39 -0500491 }
492 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
493 return(-1);
494}
495
496void follow(Node *v) /* collects leaves that can follow v into setvec */
497{
498 Node *p;
499
500 if (type(v) == FINAL)
501 return;
502 p = parent(v);
503 switch (type(p)) {
504 case STAR:
505 case PLUS:
506 first(v);
507 follow(p);
508 return;
509
510 case OR:
511 case QUEST:
512 follow(p);
513 return;
514
515 case CAT:
516 if (v == left(p)) { /* v is left child of p */
517 if (first(right(p)) == 0) {
518 follow(p);
519 return;
520 }
521 } else /* v is right child */
522 follow(p);
523 return;
524 }
525}
526
527int member(int c, const char *sarg) /* is c in s? */
528{
zoulasc65892082019-10-24 09:40:15 -0400529 const uschar *s = (const uschar *) sarg;
Brian Kernighan87b94932012-12-22 10:35:39 -0500530
531 while (*s)
532 if (c == *s++)
533 return(1);
534 return(0);
535}
536
537int match(fa *f, const char *p0) /* shortest match ? */
538{
539 int s, ns;
zoulasc65892082019-10-24 09:40:15 -0400540 const uschar *p = (const uschar *) p0;
Brian Kernighan87b94932012-12-22 10:35:39 -0500541
zoulascc16e8692019-10-17 13:04:46 -0400542 s = f->initstat;
543 assert (s < f->state_count);
544
Brian Kernighan87b94932012-12-22 10:35:39 -0500545 if (f->out[s])
546 return(1);
547 do {
548 /* assert(*p < NCHARS); */
549 if ((ns = f->gototab[s][*p]) != 0)
550 s = ns;
551 else
552 s = cgoto(f, s, *p);
553 if (f->out[s])
554 return(1);
555 } while (*p++ != 0);
556 return(0);
557}
558
559int pmatch(fa *f, const char *p0) /* longest match, for sub */
560{
561 int s, ns;
zoulasc65892082019-10-24 09:40:15 -0400562 const uschar *p = (const uschar *) p0;
563 const uschar *q;
Brian Kernighan87b94932012-12-22 10:35:39 -0500564
zoulascc16e8692019-10-17 13:04:46 -0400565 s = f->initstat;
566 assert(s < f->state_count);
567
zoulasc65892082019-10-24 09:40:15 -0400568 patbeg = (const char *)p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500569 patlen = -1;
570 do {
571 q = p;
572 do {
573 if (f->out[s]) /* final state */
574 patlen = q-p;
575 /* assert(*q < NCHARS); */
576 if ((ns = f->gototab[s][*q]) != 0)
577 s = ns;
578 else
579 s = cgoto(f, s, *q);
zoulascc16e8692019-10-17 13:04:46 -0400580
581 assert(s < f->state_count);
582
Brian Kernighan87b94932012-12-22 10:35:39 -0500583 if (s == 1) { /* no transition */
584 if (patlen >= 0) {
zoulasc65892082019-10-24 09:40:15 -0400585 patbeg = (const char *) p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500586 return(1);
587 }
588 else
589 goto nextin; /* no match */
590 }
591 } while (*q++ != 0);
592 if (f->out[s])
593 patlen = q-p-1; /* don't count $ */
594 if (patlen >= 0) {
zoulasc65892082019-10-24 09:40:15 -0400595 patbeg = (const char *) p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500596 return(1);
597 }
598 nextin:
599 s = 2;
zoulascc16e8692019-10-17 13:04:46 -0400600 } while (*p++);
Brian Kernighan87b94932012-12-22 10:35:39 -0500601 return (0);
602}
603
604int nematch(fa *f, const char *p0) /* non-empty match, for sub */
605{
606 int s, ns;
zoulasc65892082019-10-24 09:40:15 -0400607 const uschar *p = (const uschar *) p0;
608 const uschar *q;
Brian Kernighan87b94932012-12-22 10:35:39 -0500609
zoulascc16e8692019-10-17 13:04:46 -0400610 s = f->initstat;
611 assert(s < f->state_count);
612
zoulasc65892082019-10-24 09:40:15 -0400613 patbeg = (const char *)p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500614 patlen = -1;
615 while (*p) {
616 q = p;
617 do {
618 if (f->out[s]) /* final state */
619 patlen = q-p;
620 /* assert(*q < NCHARS); */
621 if ((ns = f->gototab[s][*q]) != 0)
622 s = ns;
623 else
624 s = cgoto(f, s, *q);
625 if (s == 1) { /* no transition */
626 if (patlen > 0) {
zoulasc65892082019-10-24 09:40:15 -0400627 patbeg = (const char *) p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500628 return(1);
629 } else
630 goto nnextin; /* no nonempty match */
631 }
632 } while (*q++ != 0);
633 if (f->out[s])
634 patlen = q-p-1; /* don't count $ */
635 if (patlen > 0 ) {
zoulasc65892082019-10-24 09:40:15 -0400636 patbeg = (const char *) p;
Brian Kernighan87b94932012-12-22 10:35:39 -0500637 return(1);
638 }
639 nnextin:
640 s = 2;
Brian Kernighan87b94932012-12-22 10:35:39 -0500641 p++;
642 }
643 return (0);
644}
645
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300646
647/*
648 * NAME
649 * fnematch
650 *
651 * DESCRIPTION
652 * A stream-fed version of nematch which transfers characters to a
653 * null-terminated buffer. All characters up to and including the last
654 * character of the matching text or EOF are placed in the buffer. If
655 * a match is found, patbeg and patlen are set appropriately.
656 *
657 * RETURN VALUES
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200658 * false No match found.
659 * true Match found.
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300660 */
661
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200662bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300663{
Arnold D. Robbins5ff28202019-10-07 15:50:53 +0300664 char *buf = *pbuf;
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300665 int bufsize = *pbufsize;
666 int c, i, j, k, ns, s;
667
668 s = pfa->initstat;
669 patlen = 0;
670
671 /*
672 * All indices relative to buf.
673 * i <= j <= k <= bufsize
674 *
675 * i: origin of active substring
676 * j: current character
677 * k: destination of next getc()
678 */
679 i = -1, k = 0;
680 do {
681 j = i++;
682 do {
683 if (++j == k) {
684 if (k == bufsize)
685 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
686 FATAL("stream '%.30s...' too long", buf);
687 buf[k++] = (c = getc(f)) != EOF ? c : 0;
688 }
Todd C. Miller22ee26b2020-07-29 12:27:45 -0600689 c = (uschar)buf[j];
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300690 /* assert(c < NCHARS); */
691
692 if ((ns = pfa->gototab[s][c]) != 0)
693 s = ns;
694 else
695 s = cgoto(pfa, s, c);
696
697 if (pfa->out[s]) { /* final state */
698 patlen = j - i + 1;
699 if (c == 0) /* don't count $ */
700 patlen--;
701 }
702 } while (buf[j] && s != 1);
703 s = 2;
704 } while (buf[i] && !patlen);
705
706 /* adjbuf() may have relocated a resized buffer. Inform the world. */
707 *pbuf = buf;
708 *pbufsize = bufsize;
709
710 if (patlen) {
711 patbeg = (char *) buf + i;
712 /*
713 * Under no circumstances is the last character fed to
714 * the automaton part of the match. It is EOF's nullbyte,
715 * or it sent the automaton into a state with no further
716 * transitions available (s==1), or both. Room for a
717 * terminating nullbyte is guaranteed.
718 *
719 * ungetc any chars after the end of matching text
720 * (except for EOF's nullbyte, if present) and null
721 * terminate the buffer.
722 */
723 do
724 if (buf[--k] && ungetc(buf[k], f) == EOF)
725 FATAL("unable to ungetc '%c'", buf[k]);
726 while (k > i + patlen);
Arnold D. Robbins140802c2020-01-01 22:42:50 +0200727 buf[k] = '\0';
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200728 return true;
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300729 }
730 else
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200731 return false;
Arnold D. Robbins643a5a32019-07-28 20:12:05 +0300732}
733
Brian Kernighan87b94932012-12-22 10:35:39 -0500734Node *reparse(const char *p) /* parses regular expression pointed to by p */
735{ /* uses relex() to scan regular expression */
736 Node *np;
737
Todd C. Miller292d39f2020-06-25 12:32:34 -0600738 DPRINTF("reparse <%s>\n", p);
zoulasc65892082019-10-24 09:40:15 -0400739 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
Brian Kernighan87b94932012-12-22 10:35:39 -0500740 rtok = relex();
741 /* GNU compatibility: an empty regexp matches anything */
742 if (rtok == '\0') {
743 /* FATAL("empty regular expression"); previous */
744 return(op2(EMPTYRE, NIL, NIL));
745 }
746 np = regexp();
747 if (rtok != '\0')
748 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
749 return(np);
750}
751
752Node *regexp(void) /* top-level parse of reg expr */
753{
754 return (alt(concat(primary())));
755}
756
757Node *primary(void)
758{
759 Node *np;
Martijn Dekker8a222222019-01-23 09:12:27 +0000760 int savelastatom;
Brian Kernighan87b94932012-12-22 10:35:39 -0500761
762 switch (rtok) {
763 case CHAR:
Martijn Dekker8a222222019-01-23 09:12:27 +0000764 lastatom = starttok;
Brian Kernighan87b94932012-12-22 10:35:39 -0500765 np = op2(CHAR, NIL, itonp(rlxval));
766 rtok = relex();
767 return (unary(np));
768 case ALL:
769 rtok = relex();
770 return (unary(op2(ALL, NIL, NIL)));
771 case EMPTYRE:
772 rtok = relex();
Martijn Dekker8a222222019-01-23 09:12:27 +0000773 return (unary(op2(EMPTYRE, NIL, NIL)));
Brian Kernighan87b94932012-12-22 10:35:39 -0500774 case DOT:
Martijn Dekker8a222222019-01-23 09:12:27 +0000775 lastatom = starttok;
Brian Kernighan87b94932012-12-22 10:35:39 -0500776 rtok = relex();
777 return (unary(op2(DOT, NIL, NIL)));
778 case CCL:
zoulasc65892082019-10-24 09:40:15 -0400779 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
Martijn Dekker8a222222019-01-23 09:12:27 +0000780 lastatom = starttok;
Brian Kernighan87b94932012-12-22 10:35:39 -0500781 rtok = relex();
782 return (unary(np));
783 case NCCL:
zoulasc65892082019-10-24 09:40:15 -0400784 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
Martijn Dekker8a222222019-01-23 09:12:27 +0000785 lastatom = starttok;
Brian Kernighan87b94932012-12-22 10:35:39 -0500786 rtok = relex();
787 return (unary(np));
788 case '^':
789 rtok = relex();
790 return (unary(op2(CHAR, NIL, itonp(HAT))));
791 case '$':
792 rtok = relex();
793 return (unary(op2(CHAR, NIL, NIL)));
794 case '(':
Martijn Dekker8a222222019-01-23 09:12:27 +0000795 lastatom = starttok;
796 savelastatom = starttok - basestr; /* Retain over recursion */
Brian Kernighan87b94932012-12-22 10:35:39 -0500797 rtok = relex();
798 if (rtok == ')') { /* special pleading for () */
799 rtok = relex();
800 return unary(op2(CCL, NIL, (Node *) tostring("")));
801 }
802 np = regexp();
803 if (rtok == ')') {
Martijn Dekker8a222222019-01-23 09:12:27 +0000804 lastatom = basestr + savelastatom; /* Restore */
Brian Kernighan87b94932012-12-22 10:35:39 -0500805 rtok = relex();
806 return (unary(np));
807 }
808 else
809 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
810 default:
811 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
812 }
813 return 0; /*NOTREACHED*/
814}
815
816Node *concat(Node *np)
817{
818 switch (rtok) {
Martijn Dekker8a222222019-01-23 09:12:27 +0000819 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
Brian Kernighan87b94932012-12-22 10:35:39 -0500820 return (concat(op2(CAT, np, primary())));
Martijn Dekker8a222222019-01-23 09:12:27 +0000821 case EMPTYRE:
822 rtok = relex();
823 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
824 primary())));
Brian Kernighan87b94932012-12-22 10:35:39 -0500825 }
826 return (np);
827}
828
829Node *alt(Node *np)
830{
831 if (rtok == OR) {
832 rtok = relex();
833 return (alt(op2(OR, np, concat(primary()))));
834 }
835 return (np);
836}
837
838Node *unary(Node *np)
839{
840 switch (rtok) {
841 case STAR:
842 rtok = relex();
843 return (unary(op2(STAR, np, NIL)));
844 case PLUS:
845 rtok = relex();
846 return (unary(op2(PLUS, np, NIL)));
847 case QUEST:
848 rtok = relex();
849 return (unary(op2(QUEST, np, NIL)));
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +0200850 case ZERO:
851 rtok = relex();
852 return (unary(op2(ZERO, np, NIL)));
Brian Kernighan87b94932012-12-22 10:35:39 -0500853 default:
854 return (np);
855 }
856}
857
858/*
859 * Character class definitions conformant to the POSIX locale as
860 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
861 * and operating character sets are both ASCII (ISO646) or supersets
862 * thereof.
863 *
864 * Note that to avoid overflowing the temporary buffer used in
865 * relex(), the expanded character class (prior to range expansion)
866 * must be less than twice the size of their full name.
867 */
868
869/* Because isblank doesn't show up in any of the header files on any
870 * system i use, it's defined here. if some other locale has a richer
871 * definition of "blank", define HAS_ISBLANK and provide your own
872 * version.
873 * the parentheses here are an attempt to find a path through the maze
874 * of macro definition and/or function and/or version provided. thanks
875 * to nelson beebe for the suggestion; let's see if it works everywhere.
876 */
877
878/* #define HAS_ISBLANK */
879#ifndef HAS_ISBLANK
880
881int (xisblank)(int c)
882{
883 return c==' ' || c=='\t';
884}
885
886#endif
887
zoulasc6a877092020-01-24 04:11:59 -0500888static const struct charclass {
Brian Kernighan87b94932012-12-22 10:35:39 -0500889 const char *cc_name;
890 int cc_namelen;
891 int (*cc_func)(int);
892} charclasses[] = {
893 { "alnum", 5, isalnum },
894 { "alpha", 5, isalpha },
895#ifndef HAS_ISBLANK
Arnold D. Robbins32093f52018-08-22 20:40:26 +0300896 { "blank", 5, xisblank },
Brian Kernighan87b94932012-12-22 10:35:39 -0500897#else
898 { "blank", 5, isblank },
899#endif
900 { "cntrl", 5, iscntrl },
901 { "digit", 5, isdigit },
902 { "graph", 5, isgraph },
903 { "lower", 5, islower },
904 { "print", 5, isprint },
905 { "punct", 5, ispunct },
906 { "space", 5, isspace },
907 { "upper", 5, isupper },
908 { "xdigit", 6, isxdigit },
909 { NULL, 0, NULL },
910};
911
Martijn Dekker8a222222019-01-23 09:12:27 +0000912#define REPEAT_SIMPLE 0
913#define REPEAT_PLUS_APPENDED 1
914#define REPEAT_WITH_Q 2
915#define REPEAT_ZERO 3
916
917static int
918replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
919 int atomlen, int firstnum, int secondnum, int special_case)
920{
921 int i, j;
922 uschar *buf = 0;
923 int ret = 1;
Arnold D. Robbins944989b2020-01-06 00:01:46 -0700924 int init_q = (firstnum == 0); /* first added char will be ? */
Martijn Dekker8a222222019-01-23 09:12:27 +0000925 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
926 int prefix_length = reptok - basestr; /* prefix includes first rep */
zoulasc65892082019-10-24 09:40:15 -0400927 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
Martijn Dekker8a222222019-01-23 09:12:27 +0000928 int size = prefix_length + suffix_length;
929
930 if (firstnum > 1) { /* add room for reps 2 through firstnum */
931 size += atomlen*(firstnum-1);
932 }
933
934 /* Adjust size of buffer for special cases */
935 if (special_case == REPEAT_PLUS_APPENDED) {
936 size++; /* for the final + */
937 } else if (special_case == REPEAT_WITH_Q) {
Todd C. Millerd54b7032021-03-02 12:58:50 -0700938 size += init_q + (atomlen+1)* (n_q_reps-init_q);
Martijn Dekker8a222222019-01-23 09:12:27 +0000939 } else if (special_case == REPEAT_ZERO) {
940 size += 2; /* just a null ERE: () */
941 }
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +0300942 if ((buf = (uschar *) malloc(size + 1)) == NULL)
Martijn Dekker8a222222019-01-23 09:12:27 +0000943 FATAL("out of space in reg expr %.10s..", lastre);
944 memcpy(buf, basestr, prefix_length); /* copy prefix */
945 j = prefix_length;
946 if (special_case == REPEAT_ZERO) {
947 j -= atomlen;
948 buf[j++] = '(';
949 buf[j++] = ')';
950 }
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +0200951 for (i = 1; i < firstnum; i++) { /* copy x reps */
Martijn Dekker8a222222019-01-23 09:12:27 +0000952 memcpy(&buf[j], atom, atomlen);
953 j += atomlen;
954 }
955 if (special_case == REPEAT_PLUS_APPENDED) {
956 buf[j++] = '+';
957 } else if (special_case == REPEAT_WITH_Q) {
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200958 if (init_q)
959 buf[j++] = '?';
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +0200960 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
Martijn Dekker8a222222019-01-23 09:12:27 +0000961 memcpy(&buf[j], atom, atomlen);
962 j += atomlen;
963 buf[j++] = '?';
964 }
965 }
966 memcpy(&buf[j], reptok+reptoklen, suffix_length);
Todd C. Millerd54b7032021-03-02 12:58:50 -0700967 j += suffix_length;
968 buf[j] = '\0';
Martijn Dekker8a222222019-01-23 09:12:27 +0000969 /* free old basestr */
970 if (firstbasestr != basestr) {
971 if (basestr)
972 xfree(basestr);
973 }
974 basestr = buf;
975 prestr = buf + prefix_length;
976 if (special_case == REPEAT_ZERO) {
977 prestr -= atomlen;
978 ret++;
979 }
980 return ret;
981}
982
983static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
984 int atomlen, int firstnum, int secondnum)
985{
986 /*
987 In general, the repetition specifier or "bound" is replaced here
988 by an equivalent ERE string, repeating the immediately previous atom
989 and appending ? and + as needed. Note that the first copy of the
990 atom is left in place, except in the special_case of a zero-repeat
991 (i.e., {0}).
992 */
993 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
994 if (firstnum < 2) {
995 /* 0 or 1: should be handled before you get here */
Martijn Dekker0619d5d2019-02-21 22:38:16 +0100996 FATAL("internal error");
Martijn Dekker8a222222019-01-23 09:12:27 +0000997 } else {
998 return replace_repeat(reptok, reptoklen, atom, atomlen,
999 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1000 }
1001 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1002 if (firstnum == 0) { /* {0} or {0,0} */
1003 /* This case is unusual because the resulting
1004 replacement string might actually be SMALLER than
1005 the original ERE */
1006 return replace_repeat(reptok, reptoklen, atom, atomlen,
1007 firstnum, secondnum, REPEAT_ZERO);
1008 } else { /* (firstnum >= 1) */
1009 return replace_repeat(reptok, reptoklen, atom, atomlen,
1010 firstnum, secondnum, REPEAT_SIMPLE);
1011 }
1012 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1013 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1014 return replace_repeat(reptok, reptoklen, atom, atomlen,
1015 firstnum, secondnum, REPEAT_WITH_Q);
1016 } else { /* Error - shouldn't be here (n>m) */
Martijn Dekker0619d5d2019-02-21 22:38:16 +01001017 FATAL("internal error");
Martijn Dekker8a222222019-01-23 09:12:27 +00001018 }
1019 return 0;
1020}
Brian Kernighan87b94932012-12-22 10:35:39 -05001021
1022int relex(void) /* lexical analyzer for reparse */
1023{
1024 int c, n;
1025 int cflag;
pfg52421942016-06-03 21:23:11 +00001026 static uschar *buf = NULL;
Brian Kernighan87b94932012-12-22 10:35:39 -05001027 static int bufsz = 100;
1028 uschar *bp;
zoulasc6a877092020-01-24 04:11:59 -05001029 const struct charclass *cc;
Brian Kernighan87b94932012-12-22 10:35:39 -05001030 int i;
Arnold D. Robbins108224b2019-11-10 21:19:18 +02001031 int num, m;
1032 bool commafound, digitfound;
Martijn Dekker8a222222019-01-23 09:12:27 +00001033 const uschar *startreptok;
Arnold D. Robbins28dacbd2019-06-04 23:53:31 -06001034 static int parens = 0;
Martijn Dekker8a222222019-01-23 09:12:27 +00001035
1036rescan:
1037 starttok = prestr;
Brian Kernighan87b94932012-12-22 10:35:39 -05001038
1039 switch (c = *prestr++) {
1040 case '|': return OR;
1041 case '*': return STAR;
1042 case '+': return PLUS;
1043 case '?': return QUEST;
1044 case '.': return DOT;
1045 case '\0': prestr--; return '\0';
1046 case '^':
1047 case '$':
Brian Kernighan87b94932012-12-22 10:35:39 -05001048 return c;
Arnold D. Robbins28dacbd2019-06-04 23:53:31 -06001049 case '(':
1050 parens++;
1051 return c;
1052 case ')':
1053 if (parens) {
1054 parens--;
1055 return c;
1056 }
1057 /* unmatched close parenthesis; per POSIX, treat as literal */
1058 rlxval = c;
1059 return CHAR;
Brian Kernighan87b94932012-12-22 10:35:39 -05001060 case '\\':
1061 rlxval = quoted(&prestr);
1062 return CHAR;
1063 default:
1064 rlxval = c;
1065 return CHAR;
Arnold D. Robbins795a06b2019-07-28 05:51:52 -06001066 case '[':
Arnold D. Robbins3b42cfa2020-10-13 20:52:43 +03001067 if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
Brian Kernighan87b94932012-12-22 10:35:39 -05001068 FATAL("out of space in reg expr %.10s..", lastre);
1069 bp = buf;
1070 if (*prestr == '^') {
1071 cflag = 1;
1072 prestr++;
1073 }
1074 else
1075 cflag = 0;
1076 n = 2 * strlen((const char *) prestr)+1;
1077 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1078 FATAL("out of space for reg expr %.10s...", lastre);
1079 for (; ; ) {
1080 if ((c = *prestr++) == '\\') {
1081 *bp++ = '\\';
1082 if ((c = *prestr++) == '\0')
1083 FATAL("nonterminated character class %.20s...", lastre);
1084 *bp++ = c;
1085 /* } else if (c == '\n') { */
1086 /* FATAL("newline in character class %.20s...", lastre); */
1087 } else if (c == '[' && *prestr == ':') {
1088 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1089 for (cc = charclasses; cc->cc_name; cc++)
1090 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1091 break;
1092 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1093 prestr[2 + cc->cc_namelen] == ']') {
1094 prestr += cc->cc_namelen + 3;
Cody Peter Melloa6392ef2018-11-12 10:25:44 -08001095 /*
1096 * BUG: We begin at 1, instead of 0, since we
1097 * would otherwise prematurely terminate the
1098 * string for classes like [[:cntrl:]]. This
1099 * means that we can't match the NUL character,
1100 * not without first adapting the entire
1101 * program to track each string's length.
1102 */
Leonardo Taccari031aac82019-01-28 17:34:58 +01001103 for (i = 1; i <= UCHAR_MAX; i++) {
Anonymous AWK fan40f05272021-10-09 19:23:05 +01001104 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
Brian Kernighan87b94932012-12-22 10:35:39 -05001105 FATAL("out of space for reg expr %.10s...", lastre);
1106 if (cc->cc_func(i)) {
awkfan77e5a89e62020-06-25 19:33:52 +01001107 /* escape backslash */
1108 if (i == '\\') {
1109 *bp++ = '\\';
1110 n++;
1111 }
1112
Brian Kernighan87b94932012-12-22 10:35:39 -05001113 *bp++ = i;
1114 n++;
1115 }
1116 }
1117 } else
1118 *bp++ = c;
Martijn Dekker8a222222019-01-23 09:12:27 +00001119 } else if (c == '[' && *prestr == '.') {
1120 char collate_char;
1121 prestr++;
1122 collate_char = *prestr++;
1123 if (*prestr == '.' && prestr[1] == ']') {
1124 prestr += 2;
1125 /* Found it: map via locale TBD: for
1126 now, simply return this char. This
1127 is sufficient to pass conformance
1128 test awk.ex 156
1129 */
1130 if (*prestr == ']') {
1131 prestr++;
1132 rlxval = collate_char;
1133 return CHAR;
1134 }
1135 }
1136 } else if (c == '[' && *prestr == '=') {
1137 char equiv_char;
1138 prestr++;
1139 equiv_char = *prestr++;
1140 if (*prestr == '=' && prestr[1] == ']') {
1141 prestr += 2;
1142 /* Found it: map via locale TBD: for now
1143 simply return this char. This is
1144 sufficient to pass conformance test
1145 awk.ex 156
1146 */
1147 if (*prestr == ']') {
1148 prestr++;
1149 rlxval = equiv_char;
1150 return CHAR;
1151 }
1152 }
Brian Kernighan87b94932012-12-22 10:35:39 -05001153 } else if (c == '\0') {
1154 FATAL("nonterminated character class %.20s", lastre);
1155 } else if (bp == buf) { /* 1st char is special */
1156 *bp++ = c;
1157 } else if (c == ']') {
1158 *bp++ = 0;
1159 rlxstr = (uschar *) tostring((char *) buf);
1160 if (cflag == 0)
1161 return CCL;
1162 else
1163 return NCCL;
1164 } else
1165 *bp++ = c;
1166 }
Martijn Dekker8a222222019-01-23 09:12:27 +00001167 break;
1168 case '{':
1169 if (isdigit(*(prestr))) {
1170 num = 0; /* Process as a repetition */
1171 n = -1; m = -1;
Arnold D. Robbins108224b2019-11-10 21:19:18 +02001172 commafound = false;
1173 digitfound = false;
Martijn Dekker8a222222019-01-23 09:12:27 +00001174 startreptok = prestr-1;
1175 /* Remember start of previous atom here ? */
1176 } else { /* just a { char, not a repetition */
1177 rlxval = c;
1178 return CHAR;
1179 }
1180 for (; ; ) {
1181 if ((c = *prestr++) == '}') {
1182 if (commafound) {
1183 if (digitfound) { /* {n,m} */
1184 m = num;
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +02001185 if (m < n)
Martijn Dekker8a222222019-01-23 09:12:27 +00001186 FATAL("illegal repetition expression: class %.20s",
1187 lastre);
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +02001188 if (n == 0 && m == 1) {
Martijn Dekker8a222222019-01-23 09:12:27 +00001189 return QUEST;
1190 }
1191 } else { /* {n,} */
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +02001192 if (n == 0)
1193 return STAR;
1194 else if (n == 1)
1195 return PLUS;
Martijn Dekker8a222222019-01-23 09:12:27 +00001196 }
1197 } else {
1198 if (digitfound) { /* {n} same as {n,n} */
1199 n = num;
1200 m = num;
1201 } else { /* {} */
1202 FATAL("illegal repetition expression: class %.20s",
1203 lastre);
1204 }
1205 }
1206 if (repeat(starttok, prestr-starttok, lastatom,
1207 startreptok - lastatom, n, m) > 0) {
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +02001208 if (n == 0 && m == 0) {
Arnold D. Robbinsa3e9e822020-01-01 22:47:29 +02001209 return ZERO;
Martijn Dekker8a222222019-01-23 09:12:27 +00001210 }
1211 /* must rescan input for next token */
1212 goto rescan;
1213 }
1214 /* Failed to replace: eat up {...} characters
1215 and treat like just PLUS */
1216 return PLUS;
1217 } else if (c == '\0') {
1218 FATAL("nonterminated character class %.20s",
1219 lastre);
1220 } else if (isdigit(c)) {
1221 num = 10 * num + c - '0';
Arnold D. Robbins108224b2019-11-10 21:19:18 +02001222 digitfound = true;
Martijn Dekker8a222222019-01-23 09:12:27 +00001223 } else if (c == ',') {
1224 if (commafound)
1225 FATAL("illegal repetition expression: class %.20s",
1226 lastre);
1227 /* looking for {n,} or {n,m} */
Arnold D. Robbins108224b2019-11-10 21:19:18 +02001228 commafound = true;
Martijn Dekker8a222222019-01-23 09:12:27 +00001229 n = num;
Arnold D. Robbins108224b2019-11-10 21:19:18 +02001230 digitfound = false; /* reset */
Martijn Dekker8a222222019-01-23 09:12:27 +00001231 num = 0;
1232 } else {
1233 FATAL("illegal repetition expression: class %.20s",
1234 lastre);
1235 }
1236 }
1237 break;
Brian Kernighan87b94932012-12-22 10:35:39 -05001238 }
1239}
1240
1241int cgoto(fa *f, int s, int c)
1242{
Brian Kernighan87b94932012-12-22 10:35:39 -05001243 int *p, *q;
zoulascc16e8692019-10-17 13:04:46 -04001244 int i, j, k;
Brian Kernighan87b94932012-12-22 10:35:39 -05001245
1246 assert(c == HAT || c < NCHARS);
1247 while (f->accept >= maxsetvec) { /* guessing here! */
zoulascc16e8692019-10-17 13:04:46 -04001248 resizesetvec(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -05001249 }
1250 for (i = 0; i <= f->accept; i++)
1251 setvec[i] = 0;
1252 setcnt = 0;
zoulascc16e8692019-10-17 13:04:46 -04001253 resize_state(f, s);
Brian Kernighan87b94932012-12-22 10:35:39 -05001254 /* compute positions of gototab[s,c] into setvec */
1255 p = f->posns[s];
1256 for (i = 1; i <= *p; i++) {
1257 if ((k = f->re[p[i]].ltype) != FINAL) {
1258 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1259 || (k == DOT && c != 0 && c != HAT)
1260 || (k == ALL && c != 0)
1261 || (k == EMPTYRE && c != 0)
1262 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1263 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1264 q = f->re[p[i]].lfollow;
1265 for (j = 1; j <= *q; j++) {
1266 if (q[j] >= maxsetvec) {
zoulascc16e8692019-10-17 13:04:46 -04001267 resizesetvec(__func__);
Brian Kernighan87b94932012-12-22 10:35:39 -05001268 }
1269 if (setvec[q[j]] == 0) {
1270 setcnt++;
1271 setvec[q[j]] = 1;
1272 }
1273 }
1274 }
1275 }
1276 }
1277 /* determine if setvec is a previous state */
1278 tmpset[0] = setcnt;
1279 j = 1;
1280 for (i = f->accept; i >= 0; i--)
1281 if (setvec[i]) {
1282 tmpset[j++] = i;
1283 }
zoulascc16e8692019-10-17 13:04:46 -04001284 resize_state(f, f->curstat > s ? f->curstat : s);
Brian Kernighan87b94932012-12-22 10:35:39 -05001285 /* tmpset == previous state? */
1286 for (i = 1; i <= f->curstat; i++) {
1287 p = f->posns[i];
1288 if ((k = tmpset[0]) != p[0])
1289 goto different;
1290 for (j = 1; j <= k; j++)
1291 if (tmpset[j] != p[j])
1292 goto different;
1293 /* setvec is state i */
zoulascaf86dac2019-12-09 02:00:45 -05001294 if (c != HAT)
1295 f->gototab[s][c] = i;
Brian Kernighan87b94932012-12-22 10:35:39 -05001296 return i;
1297 different:;
1298 }
1299
1300 /* add tmpset to current set of states */
zoulascc16e8692019-10-17 13:04:46 -04001301 ++(f->curstat);
1302 resize_state(f, f->curstat);
Brian Kernighan87b94932012-12-22 10:35:39 -05001303 for (i = 0; i < NCHARS; i++)
1304 f->gototab[f->curstat][i] = 0;
1305 xfree(f->posns[f->curstat]);
zoulascc16e8692019-10-17 13:04:46 -04001306 p = intalloc(setcnt + 1, __func__);
Brian Kernighan87b94932012-12-22 10:35:39 -05001307
1308 f->posns[f->curstat] = p;
zoulascaf86dac2019-12-09 02:00:45 -05001309 if (c != HAT)
1310 f->gototab[s][c] = f->curstat;
Brian Kernighan87b94932012-12-22 10:35:39 -05001311 for (i = 0; i <= setcnt; i++)
1312 p[i] = tmpset[i];
1313 if (setvec[f->accept])
1314 f->out[f->curstat] = 1;
1315 else
1316 f->out[f->curstat] = 0;
1317 return f->curstat;
1318}
1319
1320
1321void freefa(fa *f) /* free a finite automaton */
1322{
1323 int i;
1324
1325 if (f == NULL)
1326 return;
zoulascc16e8692019-10-17 13:04:46 -04001327 for (i = 0; i < f->state_count; i++)
1328 xfree(f->gototab[i])
Brian Kernighan87b94932012-12-22 10:35:39 -05001329 for (i = 0; i <= f->curstat; i++)
1330 xfree(f->posns[i]);
1331 for (i = 0; i <= f->accept; i++) {
1332 xfree(f->re[i].lfollow);
1333 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
Arnold D. Robbins7db55ba2019-12-27 12:02:52 +02001334 xfree(f->re[i].lval.np);
Brian Kernighan87b94932012-12-22 10:35:39 -05001335 }
1336 xfree(f->restr);
zoulascc16e8692019-10-17 13:04:46 -04001337 xfree(f->out);
1338 xfree(f->posns);
1339 xfree(f->gototab);
Brian Kernighan87b94932012-12-22 10:35:39 -05001340 xfree(f);
1341}