/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved

Permission to use, copy, modify, and distribute this software and
its documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appear in all
copies and that both that the copyright notice and this
permission notice and warranty disclaimer appear in supporting
documentation, and that the name Lucent Technologies or any of
its entities not be used in advertising or publicity pertaining
to distribution of the software without specific, written prior
permission.

LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <string.h>
28#include <ctype.h>
29#include "awk.h"
30#include "ytab.h"
31
32extern YYSTYPE yylval;
enh-google73af0972020-02-28 03:18:29 -080033extern bool infunc;
Brian Kernighan87b94932012-12-22 10:35:39 -050034
35int lineno = 1;
36int bracecnt = 0;
37int brackcnt = 0;
38int parencnt = 0;
39
/* One entry of the reserved-word table searched by word(). */
typedef struct Keyword {
	const char	*word;	/* spelling of the reserved word */
	int		sub;	/* stored in yylval.i when the word is matched */
	int		type;	/* token code returned to the parser */
} Keyword;
45
zoulasc6a877092020-01-24 04:11:59 -050046const Keyword keywords[] = { /* keep sorted: binary searched */
Brian Kernighan87b94932012-12-22 10:35:39 -050047 { "BEGIN", XBEGIN, XBEGIN },
48 { "END", XEND, XEND },
49 { "NF", VARNF, VARNF },
50 { "atan2", FATAN, BLTIN },
51 { "break", BREAK, BREAK },
52 { "close", CLOSE, CLOSE },
53 { "continue", CONTINUE, CONTINUE },
54 { "cos", FCOS, BLTIN },
55 { "delete", DELETE, DELETE },
56 { "do", DO, DO },
57 { "else", ELSE, ELSE },
58 { "exit", EXIT, EXIT },
59 { "exp", FEXP, BLTIN },
60 { "fflush", FFLUSH, BLTIN },
61 { "for", FOR, FOR },
62 { "func", FUNC, FUNC },
63 { "function", FUNC, FUNC },
64 { "getline", GETLINE, GETLINE },
65 { "gsub", GSUB, GSUB },
66 { "if", IF, IF },
67 { "in", IN, IN },
68 { "index", INDEX, INDEX },
69 { "int", FINT, BLTIN },
70 { "length", FLENGTH, BLTIN },
71 { "log", FLOG, BLTIN },
72 { "match", MATCHFCN, MATCHFCN },
73 { "next", NEXT, NEXT },
74 { "nextfile", NEXTFILE, NEXTFILE },
75 { "print", PRINT, PRINT },
76 { "printf", PRINTF, PRINTF },
77 { "rand", FRAND, BLTIN },
78 { "return", RETURN, RETURN },
79 { "sin", FSIN, BLTIN },
80 { "split", SPLIT, SPLIT },
81 { "sprintf", SPRINTF, SPRINTF },
82 { "sqrt", FSQRT, BLTIN },
83 { "srand", FSRAND, BLTIN },
84 { "sub", SUB, SUB },
85 { "substr", SUBSTR, SUBSTR },
86 { "system", FSYSTEM, BLTIN },
87 { "tolower", FTOLOWER, BLTIN },
88 { "toupper", FTOUPPER, BLTIN },
89 { "while", WHILE, WHILE },
90};
91
/*
 * RET(x): return token x from the enclosing function, first printing its
 * name when dbg is set.  Expands to a braced block and evaluates x twice,
 * so x must have no side effects.
 */
#define	RET(x)	{ if (dbg) printf("lex %s\n", tokname(x)); return (x); }
93
/* peek: report the next input character without consuming it */
static int peek(void)
{
	const int next = input();

	unput(next);	/* push it straight back so the next input() sees it again */
	return next;
}
100
/*
 * gettok: read the next raw token from the input.
 *
 * pbuf/psz describe a growable buffer owned by the caller; it must hold at
 * least two bytes on entry.  The token text is left NUL-terminated in the
 * buffer, and *pbuf / *psz are updated if the buffer was grown.
 *
 * Returns:
 *	0	at end of input (buffer untouched);
 *	'a'	for a name ([A-Za-z_][A-Za-z0-9_]*), text in the buffer;
 *	'0'	for a number, text in the buffer;
 *	else	the single character read, which is also left in the buffer.
 */
static int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminating NUL:
			 * the loop can end at end of input right after a store,
			 * and the NUL written below must still fit.
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation as above: one character plus the NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating: rem+1 points
			 * at buf[1], so truncating first would push back an
			 * empty string and silently drop those characters.
			 */
			unputstr(rem+1);	/* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
163
int	word(char *);
int	string(void);
int	regexpr(void);

/* one-shot flags examined at the top of yylex() */
bool	sc = false;	/* true => return a } right now (a ';' was returned in its place) */
bool	reg = false;	/* true => return a REGEXPR now (set by startreg()) */
Brian Kernighan87b94932012-12-22 10:35:39 -0500169
170int yylex(void)
171{
172 int c;
pfg52421942016-06-03 21:23:11 +0000173 static char *buf = NULL;
Brian Kernighan87b94932012-12-22 10:35:39 -0500174 static int bufsize = 5; /* BUG: setting this small causes core dump! */
175
zoulasc65892082019-10-24 09:40:15 -0400176 if (buf == NULL && (buf = malloc(bufsize)) == NULL)
Brian Kernighan87b94932012-12-22 10:35:39 -0500177 FATAL( "out of space in yylex" );
178 if (sc) {
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200179 sc = false;
Brian Kernighan87b94932012-12-22 10:35:39 -0500180 RET('}');
181 }
182 if (reg) {
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200183 reg = false;
Brian Kernighan87b94932012-12-22 10:35:39 -0500184 return regexpr();
185 }
186 for (;;) {
187 c = gettok(&buf, &bufsize);
188 if (c == 0)
189 return 0;
190 if (isalpha(c) || c == '_')
191 return word(buf);
192 if (isdigit(c)) {
Arnold D. Robbinsc7eeb572020-01-05 21:18:36 +0200193 char *cp = tostring(buf);
194 yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
195 free(cp);
Brian Kernighan87b94932012-12-22 10:35:39 -0500196 /* should this also have STR set? */
197 RET(NUMBER);
198 }
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600199
Brian Kernighan87b94932012-12-22 10:35:39 -0500200 yylval.i = c;
201 switch (c) {
202 case '\n': /* {EOL} */
Cody Peter Mello6fe0a042018-09-21 11:16:27 -0700203 lineno++;
Brian Kernighan87b94932012-12-22 10:35:39 -0500204 RET(NL);
205 case '\r': /* assume \n is coming */
206 case ' ': /* {WS}+ */
207 case '\t':
208 break;
209 case '#': /* #.* strip comments */
210 while ((c = input()) != '\n' && c != 0)
211 ;
212 unput(c);
Arnold D. Robbins768d6b52020-01-31 08:54:10 +0200213 /*
214 * Next line is a hack, itcompensates for
215 * unput's treatment of \n.
216 */
217 lineno++;
Brian Kernighan87b94932012-12-22 10:35:39 -0500218 break;
219 case ';':
220 RET(';');
221 case '\\':
222 if (peek() == '\n') {
223 input();
Cody Peter Mello6fe0a042018-09-21 11:16:27 -0700224 lineno++;
Brian Kernighan87b94932012-12-22 10:35:39 -0500225 } else if (peek() == '\r') {
226 input(); input(); /* \n */
227 lineno++;
228 } else {
229 RET(c);
230 }
231 break;
232 case '&':
233 if (peek() == '&') {
234 input(); RET(AND);
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600235 } else
Brian Kernighan87b94932012-12-22 10:35:39 -0500236 RET('&');
237 case '|':
238 if (peek() == '|') {
239 input(); RET(BOR);
240 } else
241 RET('|');
242 case '!':
243 if (peek() == '=') {
244 input(); yylval.i = NE; RET(NE);
245 } else if (peek() == '~') {
246 input(); yylval.i = NOTMATCH; RET(MATCHOP);
247 } else
248 RET(NOT);
249 case '~':
250 yylval.i = MATCH;
251 RET(MATCHOP);
252 case '<':
253 if (peek() == '=') {
254 input(); yylval.i = LE; RET(LE);
255 } else {
256 yylval.i = LT; RET(LT);
257 }
258 case '=':
259 if (peek() == '=') {
260 input(); yylval.i = EQ; RET(EQ);
261 } else {
262 yylval.i = ASSIGN; RET(ASGNOP);
263 }
264 case '>':
265 if (peek() == '=') {
266 input(); yylval.i = GE; RET(GE);
267 } else if (peek() == '>') {
268 input(); yylval.i = APPEND; RET(APPEND);
269 } else {
270 yylval.i = GT; RET(GT);
271 }
272 case '+':
273 if (peek() == '+') {
274 input(); yylval.i = INCR; RET(INCR);
275 } else if (peek() == '=') {
276 input(); yylval.i = ADDEQ; RET(ASGNOP);
277 } else
278 RET('+');
279 case '-':
280 if (peek() == '-') {
281 input(); yylval.i = DECR; RET(DECR);
282 } else if (peek() == '=') {
283 input(); yylval.i = SUBEQ; RET(ASGNOP);
284 } else
285 RET('-');
286 case '*':
287 if (peek() == '=') { /* *= */
288 input(); yylval.i = MULTEQ; RET(ASGNOP);
289 } else if (peek() == '*') { /* ** or **= */
290 input(); /* eat 2nd * */
291 if (peek() == '=') {
292 input(); yylval.i = POWEQ; RET(ASGNOP);
293 } else {
294 RET(POWER);
295 }
296 } else
297 RET('*');
298 case '/':
299 RET('/');
300 case '%':
301 if (peek() == '=') {
302 input(); yylval.i = MODEQ; RET(ASGNOP);
303 } else
304 RET('%');
305 case '^':
306 if (peek() == '=') {
307 input(); yylval.i = POWEQ; RET(ASGNOP);
308 } else
309 RET(POWER);
310
311 case '$':
312 /* BUG: awkward, if not wrong */
313 c = gettok(&buf, &bufsize);
314 if (isalpha(c)) {
315 if (strcmp(buf, "NF") == 0) { /* very special */
316 unputstr("(NF)");
317 RET(INDIRECT);
318 }
319 c = peek();
320 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
321 unputstr(buf);
322 RET(INDIRECT);
323 }
324 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
325 RET(IVAR);
326 } else if (c == 0) { /* */
327 SYNTAX( "unexpected end of input after $" );
328 RET(';');
329 } else {
330 unputstr(buf);
331 RET(INDIRECT);
332 }
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600333
Brian Kernighan87b94932012-12-22 10:35:39 -0500334 case '}':
335 if (--bracecnt < 0)
336 SYNTAX( "extra }" );
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200337 sc = true;
Brian Kernighan87b94932012-12-22 10:35:39 -0500338 RET(';');
339 case ']':
340 if (--brackcnt < 0)
341 SYNTAX( "extra ]" );
342 RET(']');
343 case ')':
344 if (--parencnt < 0)
345 SYNTAX( "extra )" );
346 RET(')');
347 case '{':
348 bracecnt++;
349 RET('{');
350 case '[':
351 brackcnt++;
352 RET('[');
353 case '(':
354 parencnt++;
355 RET('(');
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600356
Brian Kernighan87b94932012-12-22 10:35:39 -0500357 case '"':
358 return string(); /* BUG: should be like tran.c ? */
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600359
Brian Kernighan87b94932012-12-22 10:35:39 -0500360 default:
361 RET(c);
362 }
363 }
364}
365
366int string(void)
367{
368 int c, n;
369 char *s, *bp;
pfg52421942016-06-03 21:23:11 +0000370 static char *buf = NULL;
Brian Kernighan87b94932012-12-22 10:35:39 -0500371 static int bufsz = 500;
372
zoulasc65892082019-10-24 09:40:15 -0400373 if (buf == NULL && (buf = malloc(bufsz)) == NULL)
Brian Kernighan87b94932012-12-22 10:35:39 -0500374 FATAL("out of space for strings");
375 for (bp = buf; (c = input()) != '"'; ) {
376 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
377 FATAL("out of space for string %.10s...", buf);
378 switch (c) {
379 case '\n':
380 case '\r':
381 case 0:
Cody Peter Mello6fe0a042018-09-21 11:16:27 -0700382 *bp = '\0';
Brian Kernighan87b94932012-12-22 10:35:39 -0500383 SYNTAX( "non-terminated string %.10s...", buf );
Brian Kernighan87b94932012-12-22 10:35:39 -0500384 if (c == 0) /* hopeless */
385 FATAL( "giving up" );
Cody Peter Mello6fe0a042018-09-21 11:16:27 -0700386 lineno++;
Brian Kernighan87b94932012-12-22 10:35:39 -0500387 break;
388 case '\\':
389 c = input();
390 switch (c) {
391 case '"': *bp++ = '"'; break;
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600392 case 'n': *bp++ = '\n'; break;
Brian Kernighan87b94932012-12-22 10:35:39 -0500393 case 't': *bp++ = '\t'; break;
394 case 'f': *bp++ = '\f'; break;
395 case 'r': *bp++ = '\r'; break;
396 case 'b': *bp++ = '\b'; break;
397 case 'v': *bp++ = '\v'; break;
Arnold D. Robbins944989b2020-01-06 00:01:46 -0700398 case 'a': *bp++ = '\a'; break;
Brian Kernighan87b94932012-12-22 10:35:39 -0500399 case '\\': *bp++ = '\\'; break;
400
401 case '0': case '1': case '2': /* octal: \d \dd \ddd */
402 case '3': case '4': case '5': case '6': case '7':
403 n = c - '0';
404 if ((c = peek()) >= '0' && c < '8') {
405 n = 8 * n + input() - '0';
406 if ((c = peek()) >= '0' && c < '8')
407 n = 8 * n + input() - '0';
408 }
409 *bp++ = n;
410 break;
411
412 case 'x': /* hex \x0-9a-fA-F + */
413 { char xbuf[100], *px;
414 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
415 if (isdigit(c)
416 || (c >= 'a' && c <= 'f')
417 || (c >= 'A' && c <= 'F'))
418 *px++ = c;
419 else
420 break;
421 }
422 *px = 0;
423 unput(c);
424 sscanf(xbuf, "%x", (unsigned int *) &n);
425 *bp++ = n;
426 break;
427 }
428
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600429 default:
Brian Kernighan87b94932012-12-22 10:35:39 -0500430 *bp++ = c;
431 break;
432 }
433 break;
434 default:
435 *bp++ = c;
436 break;
437 }
438 }
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600439 *bp = 0;
Brian Kernighan87b94932012-12-22 10:35:39 -0500440 s = tostring(buf);
Arnold D. Robbinsc7eeb572020-01-05 21:18:36 +0200441 *bp++ = ' '; *bp++ = '\0';
Brian Kernighan87b94932012-12-22 10:35:39 -0500442 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
Arnold D. Robbinsc7eeb572020-01-05 21:18:36 +0200443 free(s);
Brian Kernighan87b94932012-12-22 10:35:39 -0500444 RET(STRING);
445}
446
447
zoulasc6a877092020-01-24 04:11:59 -0500448static int binsearch(char *w, const Keyword *kp, int n)
Brian Kernighan87b94932012-12-22 10:35:39 -0500449{
450 int cond, low, mid, high;
451
452 low = 0;
453 high = n - 1;
454 while (low <= high) {
455 mid = (low + high) / 2;
456 if ((cond = strcmp(w, kp[mid].word)) < 0)
457 high = mid - 1;
458 else if (cond > 0)
459 low = mid + 1;
460 else
461 return mid;
462 }
463 return -1;
464}
465
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600466int word(char *w)
Brian Kernighan87b94932012-12-22 10:35:39 -0500467{
zoulasc6a877092020-01-24 04:11:59 -0500468 const Keyword *kp;
Brian Kernighan87b94932012-12-22 10:35:39 -0500469 int c, n;
470
471 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
Brian Kernighan87b94932012-12-22 10:35:39 -0500472 if (n != -1) { /* found in table */
Alexander Richardsonad9bd2f2019-09-10 07:54:53 +0100473 kp = keywords + n;
Brian Kernighan87b94932012-12-22 10:35:39 -0500474 yylval.i = kp->sub;
475 switch (kp->type) { /* special handling */
476 case BLTIN:
477 if (kp->sub == FSYSTEM && safe)
478 SYNTAX( "system is unsafe" );
479 RET(kp->type);
480 case FUNC:
481 if (infunc)
482 SYNTAX( "illegal nested function" );
483 RET(kp->type);
484 case RETURN:
485 if (!infunc)
486 SYNTAX( "return not in function" );
487 RET(kp->type);
488 case VARNF:
489 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
490 RET(VARNF);
491 default:
492 RET(kp->type);
493 }
494 }
495 c = peek(); /* look for '(' */
496 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
497 yylval.i = n;
498 RET(ARG);
499 } else {
500 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
501 if (c == '(') {
502 RET(CALL);
503 } else {
504 RET(VAR);
505 }
506 }
507}
508
509void startreg(void) /* next call to yylex will return a regular expression */
510{
Arnold D. Robbins108224b2019-11-10 21:19:18 +0200511 reg = true;
Brian Kernighan87b94932012-12-22 10:35:39 -0500512}
513
514int regexpr(void)
515{
516 int c;
pfg52421942016-06-03 21:23:11 +0000517 static char *buf = NULL;
Brian Kernighan87b94932012-12-22 10:35:39 -0500518 static int bufsz = 500;
519 char *bp;
520
zoulasc65892082019-10-24 09:40:15 -0400521 if (buf == NULL && (buf = malloc(bufsz)) == NULL)
Brian Kernighan87b94932012-12-22 10:35:39 -0500522 FATAL("out of space for rex expr");
523 bp = buf;
524 for ( ; (c = input()) != '/' && c != 0; ) {
525 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
526 FATAL("out of space for reg expr %.10s...", buf);
527 if (c == '\n') {
Cody Peter Mello6fe0a042018-09-21 11:16:27 -0700528 *bp = '\0';
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600529 SYNTAX( "newline in regular expression %.10s...", buf );
Brian Kernighan87b94932012-12-22 10:35:39 -0500530 unput('\n');
531 break;
532 } else if (c == '\\') {
Arnold D. Robbins795a06b2019-07-28 05:51:52 -0600533 *bp++ = '\\';
Brian Kernighan87b94932012-12-22 10:35:39 -0500534 *bp++ = input();
535 } else {
536 *bp++ = c;
537 }
538 }
539 *bp = 0;
540 if (c == 0)
541 SYNTAX("non-terminated regular expression %.10s...", buf);
542 yylval.s = tostring(buf);
543 unput('/');
544 RET(REGEXPR);
545}
546
/* low-level lexical stuff, sort of inherited from lex */

char	ebuf[300];		/* circular record of characters recently read by input() */
char	*ep = ebuf;		/* next slot of ebuf to be written */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;
Brian Kernighan87b94932012-12-22 10:35:39 -0500554
555int input(void) /* get next lexical input character */
556{
557 int c;
558 extern char *lexprog;
559
560 if (yysptr > yysbuf)
561 c = (uschar)*--yysptr;
562 else if (lexprog != NULL) { /* awk '...' */
563 if ((c = (uschar)*lexprog) != 0)
564 lexprog++;
565 } else /* awk -f ... */
566 c = pgetc();
Cody Peter Mello6fe0a042018-09-21 11:16:27 -0700567 if (c == EOF)
Brian Kernighan87b94932012-12-22 10:35:39 -0500568 c = 0;
569 if (ep >= ebuf + sizeof ebuf)
570 ep = ebuf;
Cody Peter Mello6fe0a042018-09-21 11:16:27 -0700571 *ep = c;
572 if (c != 0) {
573 ep++;
574 }
575 return (c);
Brian Kernighan87b94932012-12-22 10:35:39 -0500576}
577
578void unput(int c) /* put lexical character back on input */
579{
zoulasc6a877092020-01-24 04:11:59 -0500580 if (c == '\n')
581 lineno--;
Brian Kernighan87b94932012-12-22 10:35:39 -0500582 if (yysptr >= yysbuf + sizeof(yysbuf))
583 FATAL("pushed back too much: %.20s...", yysbuf);
584 *yysptr++ = c;
585 if (--ep < ebuf)
586 ep = ebuf + sizeof(ebuf) - 1;
587}
588
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so they are read back in order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
595}