blob: 3cb1e33696f4a3429aa9da68fa8cf4801ef45c0e [file] [log] [blame]
Andy Greencc13c6f2013-11-09 10:09:09 +08001/*
2 * minilex.c
3 *
4 * High efficiency lexical state parser
5 *
Andy Greenbbc5c072014-03-09 11:49:21 +08006 * Copyright (C)2011-2014 Andy Green <andy@warmcat.com>
Andy Greencc13c6f2013-11-09 10:09:09 +08007 *
8 * Licensed under LGPL2
9 *
10 * Usage: gcc minilex.c -o minilex && ./minilex > lextable.h
11 *
12 * Run it twice to test parsing on the generated table on stderr
13 */
14
Andy Green6d1fcb72013-01-18 01:55:48 +080015#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18
Andy Green917f43a2014-10-12 14:31:47 +080019#include "lextable-strings.h"
Andy Green6d1fcb72013-01-18 01:55:48 +080020
Andy Greenbbc5c072014-03-09 11:49:21 +080021/*
22 * b7 = 0 = 1-byte seq
23 * 0x08 = fail
24 * 2-byte seq
Andy Green40110e82015-12-14 08:52:03 +080025 * 0x00 - 0x07, then terminal as given in 2nd byte
Andy Greenbbc5c072014-03-09 11:49:21 +080026 3-byte seq
Andy Green40110e82015-12-14 08:52:03 +080027 * no match: go fwd 3 byte, match: jump fwd by amt in +1/+2 bytes
Andy Greenbbc5c072014-03-09 11:49:21 +080028 * = 1 = 1-byte seq
Andy Green40110e82015-12-14 08:52:03 +080029 * no match: die, match go fwd 1 byte
Andy Greenbbc5c072014-03-09 11:49:21 +080030 */
31
Andy Green6d1fcb72013-01-18 01:55:48 +080032unsigned char lextable[] = {
Andy Greencc13c6f2013-11-09 10:09:09 +080033 #include "lextable.h"
Andy Green6d1fcb72013-01-18 01:55:48 +080034};
35
36#define PARALLEL 30
37
38struct state {
39 char c[PARALLEL];
40 int state[PARALLEL];
41 int count;
42 int bytepos;
Andy Greenbbc5c072014-03-09 11:49:21 +080043
44 int real_pos;
Andy Green6d1fcb72013-01-18 01:55:48 +080045};
46
47struct state state[1000];
48int next = 1;
49
Andy Greenbbc5c072014-03-09 11:49:21 +080050#define FAIL_CHAR 0x08
Andy Greencc13c6f2013-11-09 10:09:09 +080051
Andy Green6d1fcb72013-01-18 01:55:48 +080052int lextable_decode(int pos, char c)
53{
54 while (1) {
Andy Greenbbc5c072014-03-09 11:49:21 +080055 if (lextable[pos] & (1 << 7)) { /* 1-byte, fail on mismatch */
56 if ((lextable[pos] & 0x7f) != c)
57 return -1;
58 /* fall thru */
59 pos++;
60 if (lextable[pos] == FAIL_CHAR)
61 return -1;
Andy Green6d1fcb72013-01-18 01:55:48 +080062 return pos;
Andy Greenbbc5c072014-03-09 11:49:21 +080063 } else { /* b7 = 0, end or 3-byte */
64 if (lextable[pos] < FAIL_CHAR) /* terminal marker */
65 return pos;
Andy Green6d1fcb72013-01-18 01:55:48 +080066
Andy Greenbbc5c072014-03-09 11:49:21 +080067 if (lextable[pos] == c) /* goto */
68 return pos + (lextable[pos + 1]) +
69 (lextable[pos + 2] << 8);
70 /* fall thru goto */
71 pos += 3;
72 /* continue */
73 }
Andy Green6d1fcb72013-01-18 01:55:48 +080074 }
Andy Green6d1fcb72013-01-18 01:55:48 +080075}
76
Andy Green6d1fcb72013-01-18 01:55:48 +080077int main(void)
78{
79 int n = 0;
80 int m = 0;
81 int prev;
82 char c;
83 int walk;
84 int saw;
85 int y;
Andy Greenbbc5c072014-03-09 11:49:21 +080086 int j;
87 int pos = 0;
Andy Green6d1fcb72013-01-18 01:55:48 +080088
89 while (n < sizeof(set) / sizeof(set[0])) {
90
91 m = 0;
92 walk = 0;
93 prev = 0;
94
Andy Green909a3722013-11-13 08:03:05 +080095 if (set[n][0] == '\0') {
96 n++;
97 continue;
98 }
99
Andy Green6d1fcb72013-01-18 01:55:48 +0800100 while (set[n][m]) {
101
102 saw = 0;
103 for (y = 0; y < state[walk].count; y++)
Andy Greencc13c6f2013-11-09 10:09:09 +0800104 if (state[walk].c[y] == set[n][m]) {
105 /* exists -- go forward */
106 walk = state[walk].state[y];
Andy Green6d1fcb72013-01-18 01:55:48 +0800107 saw = 1;
108 break;
109 }
110
111 if (saw)
112 goto again;
113
114 /* something we didn't see before */
115
116 state[walk].c[state[walk].count] = set[n][m];
117
118 state[walk].state[state[walk].count] = next;
119 state[walk].count++;
Andy Greencc13c6f2013-11-09 10:09:09 +0800120 walk = next++;
Andy Green6d1fcb72013-01-18 01:55:48 +0800121again:
122 m++;
123 }
124
Andy Greencc13c6f2013-11-09 10:09:09 +0800125 state[walk].c[0] = n++;
Andy Green6d1fcb72013-01-18 01:55:48 +0800126 state[walk].state[0] = 0; /* terminal marker */
127 state[walk].count = 1;
Andy Green6d1fcb72013-01-18 01:55:48 +0800128 }
129
130 walk = 0;
131 for (n = 0; n < next; n++) {
132 state[n].bytepos = walk;
133 walk += (2 * state[n].count);
134 }
Andy Green6d1fcb72013-01-18 01:55:48 +0800135
Andy Greenbbc5c072014-03-09 11:49:21 +0800136 /* compute everyone's position first */
137
138 pos = 0;
Andy Green6d1fcb72013-01-18 01:55:48 +0800139 walk = 0;
140 for (n = 0; n < next; n++) {
Andy Greenbbc5c072014-03-09 11:49:21 +0800141
142 state[n].real_pos = pos;
143
Andy Green6d1fcb72013-01-18 01:55:48 +0800144 for (m = 0; m < state[n].count; m++) {
Andy Greencc13c6f2013-11-09 10:09:09 +0800145
Andy Greenbbc5c072014-03-09 11:49:21 +0800146 if (state[n].state[m] == 0)
147 pos += 2; /* terminal marker */
148 else { /* c is a character */
149 if ((state[state[n].state[m]].bytepos -
150 walk) == 2)
151 pos++;
152 else {
153 pos += 3;
154 if (m == state[n].count - 1)
155 pos++; /* fail */
Andy Green6d1fcb72013-01-18 01:55:48 +0800156 }
Andy Green6d1fcb72013-01-18 01:55:48 +0800157 }
158 walk += 2;
159 }
160 }
161
Andy Greenbbc5c072014-03-09 11:49:21 +0800162 walk = 0;
163 pos = 0;
164 for (n = 0; n < next; n++) {
165 for (m = 0; m < state[n].count; m++) {
166
167 if (!m)
168 fprintf(stdout, "/* pos %04x: %3d */ ",
169 state[n].real_pos, n);
170 else
171 fprintf(stdout, " ");
172
173 y = state[n].c[m];
174 saw = state[n].state[m];
175
176 if (saw == 0) { // c is a terminal then
177
178 if (y > 0x7ff) {
179 fprintf(stderr, "terminal too big\n");
180 return 2;
181 }
182
183 fprintf(stdout, " 0x%02X, 0x%02X "
184 " "
185 "/* - terminal marker %2d - */,\n",
186 y >> 8, y & 0xff, y & 0x7f);
187 pos += 2;
188 walk += 2;
189 continue;
190 }
191
192 /* c is a character */
193
194 prev = y &0x7f;
195 if (prev < 32 || prev > 126)
196 prev = '.';
197
198
199 if ((state[saw].bytepos - walk) == 2) {
200 fprintf(stdout, " 0x%02X /* '%c' -> */,\n",
201 y | 0x80, prev);
202 pos++;
203 walk += 2;
204 continue;
205 }
206
207 j = state[saw].real_pos - pos;
208
209 if (j > 0xffff) {
210 fprintf(stderr,
211 "Jump > 64K bytes ahead (%d to %d)\n",
212 state[n].real_pos, state[saw].real_pos);
213 return 1;
214 }
215 fprintf(stdout, " 0x%02X /* '%c' */, 0x%02X, 0x%02X "
216 "/* (to 0x%04X state %3d) */,\n",
217 y, prev,
218 j & 0xff, j >> 8,
219 state[saw].real_pos, saw);
220 pos += 3;
221
222 if (m == state[n].count - 1) {
223 fprintf(stdout,
224 " 0x%02X, /* fail */\n",
225 FAIL_CHAR);
226 pos++; /* fail */
227 }
228
229 walk += 2;
230 }
231 }
232
233 fprintf(stdout, "/* total size %d bytes */\n", pos);
Andy Greencc13c6f2013-11-09 10:09:09 +0800234
235 /*
Andy Greenbbc5c072014-03-09 11:49:21 +0800236 * Try to parse every legal input string
Andy Greencc13c6f2013-11-09 10:09:09 +0800237 */
Andy Green6d1fcb72013-01-18 01:55:48 +0800238
239 for (n = 0; n < sizeof(set) / sizeof(set[0]); n++) {
240 walk = 0;
241 m = 0;
Andy Greenbbc5c072014-03-09 11:49:21 +0800242 y = -1;
Andy Green6d1fcb72013-01-18 01:55:48 +0800243
Andy Green909a3722013-11-13 08:03:05 +0800244 if (set[n][0] == '\0')
245 continue;
246
Andy Greenbbc5c072014-03-09 11:49:21 +0800247 fprintf(stderr, " trying '%s'\n", set[n]);
Andy Green6d1fcb72013-01-18 01:55:48 +0800248
249 while (set[n][m]) {
250 walk = lextable_decode(walk, set[n][m]);
251 if (walk < 0) {
252 fprintf(stderr, "failed\n");
Andy Greenbbc5c072014-03-09 11:49:21 +0800253 return 3;
Andy Green6d1fcb72013-01-18 01:55:48 +0800254 }
Andy Greenbbc5c072014-03-09 11:49:21 +0800255
256 if (lextable[walk] < FAIL_CHAR) {
257 y = (lextable[walk] << 8) + lextable[walk + 1];
Andy Green6d1fcb72013-01-18 01:55:48 +0800258 break;
259 }
260 m++;
261 }
Andy Greenbbc5c072014-03-09 11:49:21 +0800262
263 if (y != n) {
264 fprintf(stderr, "decode failed %d\n", y);
265 return 4;
266 }
Andy Green6d1fcb72013-01-18 01:55:48 +0800267 }
268
Andy Greenbbc5c072014-03-09 11:49:21 +0800269 fprintf(stderr, "All decode OK\n");
270
Andy Green6d1fcb72013-01-18 01:55:48 +0800271 return 0;
272}