blob: 1aea0f6a0859ed3276ae79cc65d5d0b9b8a0b746 [file] [log] [blame]
Andy Greencc13c6f2013-11-09 10:09:09 +08001/*
2 * minilex.c
3 *
4 * High efficiency lexical state parser
5 *
Andy Greenbbc5c072014-03-09 11:49:21 +08006 * Copyright (C)2011-2014 Andy Green <andy@warmcat.com>
Andy Greencc13c6f2013-11-09 10:09:09 +08007 *
8 * Licensed under LGPL2
9 *
10 * Usage: gcc minilex.c -o minilex && ./minilex > lextable.h
11 *
12 * Run it twice to test parsing on the generated table on stderr
13 */
14
Andy Green6d1fcb72013-01-18 01:55:48 +080015#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18
Andy Green917f43a2014-10-12 14:31:47 +080019#include "lextable-strings.h"
Andy Green6d1fcb72013-01-18 01:55:48 +080020
Andy Greenbbc5c072014-03-09 11:49:21 +080021/*
22 * b7 = 0 = 1-byte seq
23 * 0x08 = fail
24 * 2-byte seq
25 * 0x00 - 0x07, then terminal as given in 2nd byte
26 3-byte seq
27 * no match: go fwd 3 byte, match: jump fwd by amt in +1/+2 bytes
28 * = 1 = 1-byte seq
29 * no match: die, match go fwd 1 byte
30 */
31
Andy Green6d1fcb72013-01-18 01:55:48 +080032unsigned char lextable[] = {
Andy Greencc13c6f2013-11-09 10:09:09 +080033 #include "lextable.h"
Andy Green6d1fcb72013-01-18 01:55:48 +080034};
35
36#define PARALLEL 30
37
38struct state {
39 char c[PARALLEL];
40 int state[PARALLEL];
41 int count;
42 int bytepos;
Andy Greenbbc5c072014-03-09 11:49:21 +080043
44 int real_pos;
Andy Green6d1fcb72013-01-18 01:55:48 +080045};
46
47struct state state[1000];
48int next = 1;
49
Andy Greenbbc5c072014-03-09 11:49:21 +080050#define FAIL_CHAR 0x08
Andy Greencc13c6f2013-11-09 10:09:09 +080051
Andy Green6d1fcb72013-01-18 01:55:48 +080052int lextable_decode(int pos, char c)
53{
Andy Greenbbc5c072014-03-09 11:49:21 +080054
Andy Green6d1fcb72013-01-18 01:55:48 +080055 while (1) {
Andy Greenbbc5c072014-03-09 11:49:21 +080056 if (lextable[pos] & (1 << 7)) { /* 1-byte, fail on mismatch */
57 if ((lextable[pos] & 0x7f) != c)
58 return -1;
59 /* fall thru */
60 pos++;
61 if (lextable[pos] == FAIL_CHAR)
62 return -1;
Andy Green6d1fcb72013-01-18 01:55:48 +080063 return pos;
Andy Greenbbc5c072014-03-09 11:49:21 +080064 } else { /* b7 = 0, end or 3-byte */
65 if (lextable[pos] < FAIL_CHAR) /* terminal marker */
66 return pos;
Andy Green6d1fcb72013-01-18 01:55:48 +080067
Andy Greenbbc5c072014-03-09 11:49:21 +080068 if (lextable[pos] == c) /* goto */
69 return pos + (lextable[pos + 1]) +
70 (lextable[pos + 2] << 8);
71 /* fall thru goto */
72 pos += 3;
73 /* continue */
74 }
Andy Green6d1fcb72013-01-18 01:55:48 +080075 }
Andy Green6d1fcb72013-01-18 01:55:48 +080076}
77
Andy Green6d1fcb72013-01-18 01:55:48 +080078int main(void)
79{
80 int n = 0;
81 int m = 0;
82 int prev;
83 char c;
84 int walk;
85 int saw;
86 int y;
Andy Greenbbc5c072014-03-09 11:49:21 +080087 int j;
88 int pos = 0;
Andy Green6d1fcb72013-01-18 01:55:48 +080089
90 while (n < sizeof(set) / sizeof(set[0])) {
91
92 m = 0;
93 walk = 0;
94 prev = 0;
95
Andy Green909a3722013-11-13 08:03:05 +080096 if (set[n][0] == '\0') {
97 n++;
98 continue;
99 }
100
Andy Green6d1fcb72013-01-18 01:55:48 +0800101 while (set[n][m]) {
102
103 saw = 0;
104 for (y = 0; y < state[walk].count; y++)
Andy Greencc13c6f2013-11-09 10:09:09 +0800105 if (state[walk].c[y] == set[n][m]) {
106 /* exists -- go forward */
107 walk = state[walk].state[y];
Andy Green6d1fcb72013-01-18 01:55:48 +0800108 saw = 1;
109 break;
110 }
111
112 if (saw)
113 goto again;
114
115 /* something we didn't see before */
116
117 state[walk].c[state[walk].count] = set[n][m];
118
119 state[walk].state[state[walk].count] = next;
120 state[walk].count++;
Andy Greencc13c6f2013-11-09 10:09:09 +0800121 walk = next++;
Andy Green6d1fcb72013-01-18 01:55:48 +0800122again:
123 m++;
124 }
125
Andy Greencc13c6f2013-11-09 10:09:09 +0800126 state[walk].c[0] = n++;
Andy Green6d1fcb72013-01-18 01:55:48 +0800127 state[walk].state[0] = 0; /* terminal marker */
128 state[walk].count = 1;
Andy Green6d1fcb72013-01-18 01:55:48 +0800129 }
130
131 walk = 0;
132 for (n = 0; n < next; n++) {
133 state[n].bytepos = walk;
134 walk += (2 * state[n].count);
135 }
Andy Green6d1fcb72013-01-18 01:55:48 +0800136
Andy Greenbbc5c072014-03-09 11:49:21 +0800137 /* compute everyone's position first */
138
139 pos = 0;
Andy Green6d1fcb72013-01-18 01:55:48 +0800140 walk = 0;
141 for (n = 0; n < next; n++) {
Andy Greenbbc5c072014-03-09 11:49:21 +0800142
143 state[n].real_pos = pos;
144
Andy Green6d1fcb72013-01-18 01:55:48 +0800145 for (m = 0; m < state[n].count; m++) {
Andy Greencc13c6f2013-11-09 10:09:09 +0800146
Andy Greenbbc5c072014-03-09 11:49:21 +0800147 if (state[n].state[m] == 0)
148 pos += 2; /* terminal marker */
149 else { /* c is a character */
150 if ((state[state[n].state[m]].bytepos -
151 walk) == 2)
152 pos++;
153 else {
154 pos += 3;
155 if (m == state[n].count - 1)
156 pos++; /* fail */
Andy Green6d1fcb72013-01-18 01:55:48 +0800157 }
Andy Green6d1fcb72013-01-18 01:55:48 +0800158 }
159 walk += 2;
160 }
161 }
162
Andy Greenbbc5c072014-03-09 11:49:21 +0800163 walk = 0;
164 pos = 0;
165 for (n = 0; n < next; n++) {
166 for (m = 0; m < state[n].count; m++) {
167
168 if (!m)
169 fprintf(stdout, "/* pos %04x: %3d */ ",
170 state[n].real_pos, n);
171 else
172 fprintf(stdout, " ");
173
174 y = state[n].c[m];
175 saw = state[n].state[m];
176
177 if (saw == 0) { // c is a terminal then
178
179 if (y > 0x7ff) {
180 fprintf(stderr, "terminal too big\n");
181 return 2;
182 }
183
184 fprintf(stdout, " 0x%02X, 0x%02X "
185 " "
186 "/* - terminal marker %2d - */,\n",
187 y >> 8, y & 0xff, y & 0x7f);
188 pos += 2;
189 walk += 2;
190 continue;
191 }
192
193 /* c is a character */
194
195 prev = y &0x7f;
196 if (prev < 32 || prev > 126)
197 prev = '.';
198
199
200 if ((state[saw].bytepos - walk) == 2) {
201 fprintf(stdout, " 0x%02X /* '%c' -> */,\n",
202 y | 0x80, prev);
203 pos++;
204 walk += 2;
205 continue;
206 }
207
208 j = state[saw].real_pos - pos;
209
210 if (j > 0xffff) {
211 fprintf(stderr,
212 "Jump > 64K bytes ahead (%d to %d)\n",
213 state[n].real_pos, state[saw].real_pos);
214 return 1;
215 }
216 fprintf(stdout, " 0x%02X /* '%c' */, 0x%02X, 0x%02X "
217 "/* (to 0x%04X state %3d) */,\n",
218 y, prev,
219 j & 0xff, j >> 8,
220 state[saw].real_pos, saw);
221 pos += 3;
222
223 if (m == state[n].count - 1) {
224 fprintf(stdout,
225 " 0x%02X, /* fail */\n",
226 FAIL_CHAR);
227 pos++; /* fail */
228 }
229
230 walk += 2;
231 }
232 }
233
234 fprintf(stdout, "/* total size %d bytes */\n", pos);
Andy Greencc13c6f2013-11-09 10:09:09 +0800235
236 /*
Andy Greenbbc5c072014-03-09 11:49:21 +0800237 * Try to parse every legal input string
Andy Greencc13c6f2013-11-09 10:09:09 +0800238 */
Andy Green6d1fcb72013-01-18 01:55:48 +0800239
240 for (n = 0; n < sizeof(set) / sizeof(set[0]); n++) {
241 walk = 0;
242 m = 0;
Andy Greenbbc5c072014-03-09 11:49:21 +0800243 y = -1;
Andy Green6d1fcb72013-01-18 01:55:48 +0800244
Andy Green909a3722013-11-13 08:03:05 +0800245 if (set[n][0] == '\0')
246 continue;
247
Andy Greenbbc5c072014-03-09 11:49:21 +0800248 fprintf(stderr, " trying '%s'\n", set[n]);
Andy Green6d1fcb72013-01-18 01:55:48 +0800249
250 while (set[n][m]) {
251 walk = lextable_decode(walk, set[n][m]);
252 if (walk < 0) {
253 fprintf(stderr, "failed\n");
Andy Greenbbc5c072014-03-09 11:49:21 +0800254 return 3;
Andy Green6d1fcb72013-01-18 01:55:48 +0800255 }
Andy Greenbbc5c072014-03-09 11:49:21 +0800256
257 if (lextable[walk] < FAIL_CHAR) {
258 y = (lextable[walk] << 8) + lextable[walk + 1];
Andy Green6d1fcb72013-01-18 01:55:48 +0800259 break;
260 }
261 m++;
262 }
Andy Greenbbc5c072014-03-09 11:49:21 +0800263
264 if (y != n) {
265 fprintf(stderr, "decode failed %d\n", y);
266 return 4;
267 }
Andy Green6d1fcb72013-01-18 01:55:48 +0800268 }
269
Andy Greenbbc5c072014-03-09 11:49:21 +0800270 fprintf(stderr, "All decode OK\n");
271
Andy Green6d1fcb72013-01-18 01:55:48 +0800272 return 0;
273}