Andy Green | cc13c6f | 2013-11-09 10:09:09 +0800 | [diff] [blame] | 1 | /* |
| 2 | * minilex.c |
| 3 | * |
| 4 | * High efficiency lexical state parser |
| 5 | * |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 6 | * Copyright (C)2011-2014 Andy Green <andy@warmcat.com> |
Andy Green | cc13c6f | 2013-11-09 10:09:09 +0800 | [diff] [blame] | 7 | * |
| 8 | * Licensed under LGPL2 |
| 9 | * |
| 10 | * Usage: gcc minilex.c -o minilex && ./minilex > lextable.h |
| 11 | * |
| 12 | * Run it twice to test parsing on the generated table on stderr |
| 13 | */ |
| 14 | |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 15 | #include <stdio.h> |
| 16 | #include <stdlib.h> |
| 17 | #include <string.h> |
| 18 | |
Andy Green | 917f43a | 2014-10-12 14:31:47 +0800 | [diff] [blame] | 19 | #include "lextable-strings.h" |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 20 | |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 21 | /* |
| 22 | * b7 = 0 = 1-byte seq |
| 23 | * 0x08 = fail |
| 24 | * 2-byte seq |
Andy Green | 40110e8 | 2015-12-14 08:52:03 +0800 | [diff] [blame] | 25 | * 0x00 - 0x07, then terminal as given in 2nd byte |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 26 | 3-byte seq |
Andy Green | 40110e8 | 2015-12-14 08:52:03 +0800 | [diff] [blame] | 27 | * no match: go fwd 3 byte, match: jump fwd by amt in +1/+2 bytes |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 28 | * = 1 = 1-byte seq |
Andy Green | 40110e8 | 2015-12-14 08:52:03 +0800 | [diff] [blame] | 29 | * no match: die, match go fwd 1 byte |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 30 | */ |
| 31 | |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 32 | unsigned char lextable[] = { |
Andy Green | cc13c6f | 2013-11-09 10:09:09 +0800 | [diff] [blame] | 33 | #include "lextable.h" |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 34 | }; |
| 35 | |
| 36 | #define PARALLEL 30 |
| 37 | |
| 38 | struct state { |
| 39 | char c[PARALLEL]; |
| 40 | int state[PARALLEL]; |
| 41 | int count; |
| 42 | int bytepos; |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 43 | |
| 44 | int real_pos; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 45 | }; |
| 46 | |
| 47 | struct state state[1000]; |
| 48 | int next = 1; |
| 49 | |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 50 | #define FAIL_CHAR 0x08 |
Andy Green | cc13c6f | 2013-11-09 10:09:09 +0800 | [diff] [blame] | 51 | |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 52 | int lextable_decode(int pos, char c) |
| 53 | { |
| 54 | while (1) { |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 55 | if (lextable[pos] & (1 << 7)) { /* 1-byte, fail on mismatch */ |
| 56 | if ((lextable[pos] & 0x7f) != c) |
| 57 | return -1; |
| 58 | /* fall thru */ |
| 59 | pos++; |
| 60 | if (lextable[pos] == FAIL_CHAR) |
| 61 | return -1; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 62 | return pos; |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 63 | } else { /* b7 = 0, end or 3-byte */ |
| 64 | if (lextable[pos] < FAIL_CHAR) /* terminal marker */ |
| 65 | return pos; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 66 | |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 67 | if (lextable[pos] == c) /* goto */ |
| 68 | return pos + (lextable[pos + 1]) + |
| 69 | (lextable[pos + 2] << 8); |
| 70 | /* fall thru goto */ |
| 71 | pos += 3; |
| 72 | /* continue */ |
| 73 | } |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 74 | } |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 75 | } |
| 76 | |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 77 | int main(void) |
| 78 | { |
| 79 | int n = 0; |
| 80 | int m = 0; |
| 81 | int prev; |
| 82 | char c; |
| 83 | int walk; |
| 84 | int saw; |
| 85 | int y; |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 86 | int j; |
| 87 | int pos = 0; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 88 | |
| 89 | while (n < sizeof(set) / sizeof(set[0])) { |
| 90 | |
| 91 | m = 0; |
| 92 | walk = 0; |
| 93 | prev = 0; |
| 94 | |
Andy Green | 909a372 | 2013-11-13 08:03:05 +0800 | [diff] [blame] | 95 | if (set[n][0] == '\0') { |
| 96 | n++; |
| 97 | continue; |
| 98 | } |
| 99 | |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 100 | while (set[n][m]) { |
| 101 | |
| 102 | saw = 0; |
| 103 | for (y = 0; y < state[walk].count; y++) |
Andy Green | cc13c6f | 2013-11-09 10:09:09 +0800 | [diff] [blame] | 104 | if (state[walk].c[y] == set[n][m]) { |
| 105 | /* exists -- go forward */ |
| 106 | walk = state[walk].state[y]; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 107 | saw = 1; |
| 108 | break; |
| 109 | } |
| 110 | |
| 111 | if (saw) |
| 112 | goto again; |
| 113 | |
| 114 | /* something we didn't see before */ |
| 115 | |
| 116 | state[walk].c[state[walk].count] = set[n][m]; |
| 117 | |
| 118 | state[walk].state[state[walk].count] = next; |
| 119 | state[walk].count++; |
Andy Green | cc13c6f | 2013-11-09 10:09:09 +0800 | [diff] [blame] | 120 | walk = next++; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 121 | again: |
| 122 | m++; |
| 123 | } |
| 124 | |
Andy Green | cc13c6f | 2013-11-09 10:09:09 +0800 | [diff] [blame] | 125 | state[walk].c[0] = n++; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 126 | state[walk].state[0] = 0; /* terminal marker */ |
| 127 | state[walk].count = 1; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 128 | } |
| 129 | |
| 130 | walk = 0; |
| 131 | for (n = 0; n < next; n++) { |
| 132 | state[n].bytepos = walk; |
| 133 | walk += (2 * state[n].count); |
| 134 | } |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 135 | |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 136 | /* compute everyone's position first */ |
| 137 | |
| 138 | pos = 0; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 139 | walk = 0; |
| 140 | for (n = 0; n < next; n++) { |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 141 | |
| 142 | state[n].real_pos = pos; |
| 143 | |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 144 | for (m = 0; m < state[n].count; m++) { |
Andy Green | cc13c6f | 2013-11-09 10:09:09 +0800 | [diff] [blame] | 145 | |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 146 | if (state[n].state[m] == 0) |
| 147 | pos += 2; /* terminal marker */ |
| 148 | else { /* c is a character */ |
| 149 | if ((state[state[n].state[m]].bytepos - |
| 150 | walk) == 2) |
| 151 | pos++; |
| 152 | else { |
| 153 | pos += 3; |
| 154 | if (m == state[n].count - 1) |
| 155 | pos++; /* fail */ |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 156 | } |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 157 | } |
| 158 | walk += 2; |
| 159 | } |
| 160 | } |
| 161 | |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 162 | walk = 0; |
| 163 | pos = 0; |
| 164 | for (n = 0; n < next; n++) { |
| 165 | for (m = 0; m < state[n].count; m++) { |
| 166 | |
| 167 | if (!m) |
| 168 | fprintf(stdout, "/* pos %04x: %3d */ ", |
| 169 | state[n].real_pos, n); |
| 170 | else |
| 171 | fprintf(stdout, " "); |
| 172 | |
| 173 | y = state[n].c[m]; |
| 174 | saw = state[n].state[m]; |
| 175 | |
| 176 | if (saw == 0) { // c is a terminal then |
| 177 | |
| 178 | if (y > 0x7ff) { |
| 179 | fprintf(stderr, "terminal too big\n"); |
| 180 | return 2; |
| 181 | } |
| 182 | |
| 183 | fprintf(stdout, " 0x%02X, 0x%02X " |
| 184 | " " |
| 185 | "/* - terminal marker %2d - */,\n", |
| 186 | y >> 8, y & 0xff, y & 0x7f); |
| 187 | pos += 2; |
| 188 | walk += 2; |
| 189 | continue; |
| 190 | } |
| 191 | |
| 192 | /* c is a character */ |
| 193 | |
| 194 | prev = y &0x7f; |
| 195 | if (prev < 32 || prev > 126) |
| 196 | prev = '.'; |
| 197 | |
| 198 | |
| 199 | if ((state[saw].bytepos - walk) == 2) { |
| 200 | fprintf(stdout, " 0x%02X /* '%c' -> */,\n", |
| 201 | y | 0x80, prev); |
| 202 | pos++; |
| 203 | walk += 2; |
| 204 | continue; |
| 205 | } |
| 206 | |
| 207 | j = state[saw].real_pos - pos; |
| 208 | |
| 209 | if (j > 0xffff) { |
| 210 | fprintf(stderr, |
| 211 | "Jump > 64K bytes ahead (%d to %d)\n", |
| 212 | state[n].real_pos, state[saw].real_pos); |
| 213 | return 1; |
| 214 | } |
| 215 | fprintf(stdout, " 0x%02X /* '%c' */, 0x%02X, 0x%02X " |
| 216 | "/* (to 0x%04X state %3d) */,\n", |
| 217 | y, prev, |
| 218 | j & 0xff, j >> 8, |
| 219 | state[saw].real_pos, saw); |
| 220 | pos += 3; |
| 221 | |
| 222 | if (m == state[n].count - 1) { |
| 223 | fprintf(stdout, |
| 224 | " 0x%02X, /* fail */\n", |
| 225 | FAIL_CHAR); |
| 226 | pos++; /* fail */ |
| 227 | } |
| 228 | |
| 229 | walk += 2; |
| 230 | } |
| 231 | } |
| 232 | |
| 233 | fprintf(stdout, "/* total size %d bytes */\n", pos); |
Andy Green | cc13c6f | 2013-11-09 10:09:09 +0800 | [diff] [blame] | 234 | |
| 235 | /* |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 236 | * Try to parse every legal input string |
Andy Green | cc13c6f | 2013-11-09 10:09:09 +0800 | [diff] [blame] | 237 | */ |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 238 | |
| 239 | for (n = 0; n < sizeof(set) / sizeof(set[0]); n++) { |
| 240 | walk = 0; |
| 241 | m = 0; |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 242 | y = -1; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 243 | |
Andy Green | 909a372 | 2013-11-13 08:03:05 +0800 | [diff] [blame] | 244 | if (set[n][0] == '\0') |
| 245 | continue; |
| 246 | |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 247 | fprintf(stderr, " trying '%s'\n", set[n]); |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 248 | |
| 249 | while (set[n][m]) { |
| 250 | walk = lextable_decode(walk, set[n][m]); |
| 251 | if (walk < 0) { |
| 252 | fprintf(stderr, "failed\n"); |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 253 | return 3; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 254 | } |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 255 | |
| 256 | if (lextable[walk] < FAIL_CHAR) { |
| 257 | y = (lextable[walk] << 8) + lextable[walk + 1]; |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 258 | break; |
| 259 | } |
| 260 | m++; |
| 261 | } |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 262 | |
| 263 | if (y != n) { |
| 264 | fprintf(stderr, "decode failed %d\n", y); |
| 265 | return 4; |
| 266 | } |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 267 | } |
| 268 | |
Andy Green | bbc5c07 | 2014-03-09 11:49:21 +0800 | [diff] [blame] | 269 | fprintf(stderr, "All decode OK\n"); |
| 270 | |
Andy Green | 6d1fcb7 | 2013-01-18 01:55:48 +0800 | [diff] [blame] | 271 | return 0; |
| 272 | } |