blob: 534fcf76449de9453627144eee55fbd6db26442a [file] [log] [blame]
/*
 * JSON lexer
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */
13
14#include "qstring.h"
15#include "qlist.h"
16#include "qdict.h"
17#include "qint.h"
18#include "qemu-common.h"
19#include "json-lexer.h"
20
/*
 * Approximate regular expressions for the tokens recognized below (the
 * transition table is authoritative):
 *
 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
 * 0|([1-9][0-9]*(\.[0-9]+)?([eE]([-+])?[0-9]+)?)
 * [{}\[\],:]
 * [a-z]+
 *
 */
29
/* Building with mingw results in an error because ERROR is defined as a
 * macro in this environment. Undefine it. */
#undef ERROR

/*
 * States of the lexer's state machine.
 *
 * ERROR must remain 0: the transition table is built with designated
 * initializers, so every (state, byte) pair that is not listed explicitly
 * is zero-initialized and therefore maps to ERROR.
 *
 * The values are stored in a uint8_t table next to the JSON_* token type
 * constants; presumably those constants (declared in json-lexer.h) do not
 * overlap with the values below -- confirm before adding states.
 */
enum json_lexer_state {
    ERROR = 0,

    /* \uXXXX escape inside a double-quoted string; UCODE<n> means that
     * n hex digits have been consumed so far */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,    /* just after a backslash */
    IN_DQ_STRING,           /* inside "..." */

    /* same set of states for single-quoted strings */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* numbers */
    IN_ZERO,                /* after a leading '0' */
    IN_DIGITS,              /* exponent digits */
    IN_DIGIT,               /* after the exponent sign, a digit is required */
    IN_EXP_E,               /* after 'e' or 'E' */
    IN_MANTISSA,            /* after '.', a digit is required */
    IN_MANTISSA_DIGITS,     /* fractional digits */
    IN_NONZERO_NUMBER,      /* integer part starting with 1-9 */
    IN_NEG_NONZERO_NUMBER,  /* after a leading '-' */

    IN_KEYWORD,             /* run of lowercase letters */

    /* '%' escapes: %d %i %p %s %f %ld %lld %I64d */
    IN_ESCAPE,              /* after '%' */
    IN_ESCAPE_L,            /* after "%l" */
    IN_ESCAPE_LL,           /* after "%ll" */
    IN_ESCAPE_I,            /* after "%I" */
    IN_ESCAPE_I6,           /* after "%I6" */
    IN_ESCAPE_I64,          /* after "%I64" */

    IN_WHITESPACE,
    IN_START,               /* between tokens */
};
66
/* Fill entries 0x00..0x7F of a transition row with STATE.  Meant to be the
 * first initializer of a row so that later designators can override single
 * characters.  Uses the GNU C range-designator extension. */
#define TERMINAL(state) [0 ... 0x7F] = (state)

/* Return whether TERMINAL is a terminal state and the transition to it
   from OLD_STATE required lookahead.  This happens whenever the table
   below uses the TERMINAL macro: such rows have the terminal in entry 0,
   and the byte that triggered the transition is not part of the token. */
#define TERMINAL_NEEDED_LOOKAHEAD(old_state, terminal) \
    (json_lexer[(old_state)][0] == (terminal))
David 'Digit' Turner2910f182010-05-10 18:48:35 -070074
David Turner9a5f7ce2010-09-09 22:59:04 +020075static const uint8_t json_lexer[][256] = {
David 'Digit' Turner2910f182010-05-10 18:48:35 -070076 /* double quote string */
77 [IN_DQ_UCODE3] = {
78 ['0' ... '9'] = IN_DQ_STRING,
79 ['a' ... 'f'] = IN_DQ_STRING,
80 ['A' ... 'F'] = IN_DQ_STRING,
81 },
82 [IN_DQ_UCODE2] = {
83 ['0' ... '9'] = IN_DQ_UCODE3,
84 ['a' ... 'f'] = IN_DQ_UCODE3,
85 ['A' ... 'F'] = IN_DQ_UCODE3,
86 },
87 [IN_DQ_UCODE1] = {
88 ['0' ... '9'] = IN_DQ_UCODE2,
89 ['a' ... 'f'] = IN_DQ_UCODE2,
90 ['A' ... 'F'] = IN_DQ_UCODE2,
91 },
92 [IN_DQ_UCODE0] = {
93 ['0' ... '9'] = IN_DQ_UCODE1,
94 ['a' ... 'f'] = IN_DQ_UCODE1,
95 ['A' ... 'F'] = IN_DQ_UCODE1,
96 },
97 [IN_DQ_STRING_ESCAPE] = {
98 ['b'] = IN_DQ_STRING,
99 ['f'] = IN_DQ_STRING,
100 ['n'] = IN_DQ_STRING,
101 ['r'] = IN_DQ_STRING,
102 ['t'] = IN_DQ_STRING,
David Turner9a5f7ce2010-09-09 22:59:04 +0200103 ['/'] = IN_DQ_STRING,
104 ['\\'] = IN_DQ_STRING,
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700105 ['\''] = IN_DQ_STRING,
106 ['\"'] = IN_DQ_STRING,
107 ['u'] = IN_DQ_UCODE0,
108 },
109 [IN_DQ_STRING] = {
110 [1 ... 0xFF] = IN_DQ_STRING,
111 ['\\'] = IN_DQ_STRING_ESCAPE,
David Turner9a5f7ce2010-09-09 22:59:04 +0200112 ['"'] = JSON_STRING,
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700113 },
114
115 /* single quote string */
116 [IN_SQ_UCODE3] = {
117 ['0' ... '9'] = IN_SQ_STRING,
118 ['a' ... 'f'] = IN_SQ_STRING,
119 ['A' ... 'F'] = IN_SQ_STRING,
120 },
121 [IN_SQ_UCODE2] = {
122 ['0' ... '9'] = IN_SQ_UCODE3,
123 ['a' ... 'f'] = IN_SQ_UCODE3,
124 ['A' ... 'F'] = IN_SQ_UCODE3,
125 },
126 [IN_SQ_UCODE1] = {
127 ['0' ... '9'] = IN_SQ_UCODE2,
128 ['a' ... 'f'] = IN_SQ_UCODE2,
129 ['A' ... 'F'] = IN_SQ_UCODE2,
130 },
131 [IN_SQ_UCODE0] = {
132 ['0' ... '9'] = IN_SQ_UCODE1,
133 ['a' ... 'f'] = IN_SQ_UCODE1,
134 ['A' ... 'F'] = IN_SQ_UCODE1,
135 },
136 [IN_SQ_STRING_ESCAPE] = {
137 ['b'] = IN_SQ_STRING,
138 ['f'] = IN_SQ_STRING,
139 ['n'] = IN_SQ_STRING,
140 ['r'] = IN_SQ_STRING,
141 ['t'] = IN_SQ_STRING,
David Turner9a5f7ce2010-09-09 22:59:04 +0200142 ['/'] = IN_DQ_STRING,
143 ['\\'] = IN_DQ_STRING,
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700144 ['\''] = IN_SQ_STRING,
145 ['\"'] = IN_SQ_STRING,
146 ['u'] = IN_SQ_UCODE0,
147 },
148 [IN_SQ_STRING] = {
149 [1 ... 0xFF] = IN_SQ_STRING,
150 ['\\'] = IN_SQ_STRING_ESCAPE,
David Turner9a5f7ce2010-09-09 22:59:04 +0200151 ['\''] = JSON_STRING,
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700152 },
153
154 /* Zero */
155 [IN_ZERO] = {
156 TERMINAL(JSON_INTEGER),
David 'Digit' Turner1cea2fb2010-12-22 17:38:44 +0100157 ['0' ... '9'] = ERROR,
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700158 ['.'] = IN_MANTISSA,
159 },
160
161 /* Float */
162 [IN_DIGITS] = {
163 TERMINAL(JSON_FLOAT),
164 ['0' ... '9'] = IN_DIGITS,
165 },
166
167 [IN_DIGIT] = {
168 ['0' ... '9'] = IN_DIGITS,
169 },
170
171 [IN_EXP_E] = {
172 ['-'] = IN_DIGIT,
173 ['+'] = IN_DIGIT,
174 ['0' ... '9'] = IN_DIGITS,
175 },
176
177 [IN_MANTISSA_DIGITS] = {
178 TERMINAL(JSON_FLOAT),
179 ['0' ... '9'] = IN_MANTISSA_DIGITS,
180 ['e'] = IN_EXP_E,
181 ['E'] = IN_EXP_E,
182 },
183
184 [IN_MANTISSA] = {
185 ['0' ... '9'] = IN_MANTISSA_DIGITS,
186 },
187
188 /* Number */
189 [IN_NONZERO_NUMBER] = {
190 TERMINAL(JSON_INTEGER),
191 ['0' ... '9'] = IN_NONZERO_NUMBER,
192 ['e'] = IN_EXP_E,
193 ['E'] = IN_EXP_E,
194 ['.'] = IN_MANTISSA,
195 },
196
197 [IN_NEG_NONZERO_NUMBER] = {
198 ['0'] = IN_ZERO,
199 ['1' ... '9'] = IN_NONZERO_NUMBER,
200 },
201
202 /* keywords */
203 [IN_KEYWORD] = {
204 TERMINAL(JSON_KEYWORD),
205 ['a' ... 'z'] = IN_KEYWORD,
206 },
207
208 /* whitespace */
209 [IN_WHITESPACE] = {
210 TERMINAL(JSON_SKIP),
211 [' '] = IN_WHITESPACE,
212 ['\t'] = IN_WHITESPACE,
213 ['\r'] = IN_WHITESPACE,
214 ['\n'] = IN_WHITESPACE,
215 },
216
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700217 /* escape */
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700218 [IN_ESCAPE_LL] = {
David Turner9a5f7ce2010-09-09 22:59:04 +0200219 ['d'] = JSON_ESCAPE,
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700220 },
221
222 [IN_ESCAPE_L] = {
David Turner9a5f7ce2010-09-09 22:59:04 +0200223 ['d'] = JSON_ESCAPE,
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700224 ['l'] = IN_ESCAPE_LL,
225 },
226
David 'Digit' Turner92518662010-05-10 23:26:01 -0700227 [IN_ESCAPE_I64] = {
David Turner9a5f7ce2010-09-09 22:59:04 +0200228 ['d'] = JSON_ESCAPE,
David 'Digit' Turner92518662010-05-10 23:26:01 -0700229 },
230
231 [IN_ESCAPE_I6] = {
232 ['4'] = IN_ESCAPE_I64,
233 },
234
235 [IN_ESCAPE_I] = {
236 ['6'] = IN_ESCAPE_I6,
237 },
238
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700239 [IN_ESCAPE] = {
David Turner9a5f7ce2010-09-09 22:59:04 +0200240 ['d'] = JSON_ESCAPE,
241 ['i'] = JSON_ESCAPE,
242 ['p'] = JSON_ESCAPE,
243 ['s'] = JSON_ESCAPE,
244 ['f'] = JSON_ESCAPE,
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700245 ['l'] = IN_ESCAPE_L,
David 'Digit' Turner92518662010-05-10 23:26:01 -0700246 ['I'] = IN_ESCAPE_I,
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700247 },
248
249 /* top level rule */
250 [IN_START] = {
251 ['"'] = IN_DQ_STRING,
252 ['\''] = IN_SQ_STRING,
253 ['0'] = IN_ZERO,
254 ['1' ... '9'] = IN_NONZERO_NUMBER,
255 ['-'] = IN_NEG_NONZERO_NUMBER,
David Turner9a5f7ce2010-09-09 22:59:04 +0200256 ['{'] = JSON_OPERATOR,
257 ['}'] = JSON_OPERATOR,
258 ['['] = JSON_OPERATOR,
259 [']'] = JSON_OPERATOR,
260 [','] = JSON_OPERATOR,
261 [':'] = JSON_OPERATOR,
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700262 ['a' ... 'z'] = IN_KEYWORD,
263 ['%'] = IN_ESCAPE,
264 [' '] = IN_WHITESPACE,
265 ['\t'] = IN_WHITESPACE,
266 ['\r'] = IN_WHITESPACE,
267 ['\n'] = IN_WHITESPACE,
268 },
269};
270
271void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
272{
273 lexer->emit = func;
274 lexer->state = IN_START;
275 lexer->token = qstring_new();
David Turner9a5f7ce2010-09-09 22:59:04 +0200276 lexer->x = lexer->y = 0;
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700277}
278
279static int json_lexer_feed_char(JSONLexer *lexer, char ch)
280{
David Turner9a5f7ce2010-09-09 22:59:04 +0200281 int char_consumed, new_state;
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700282
283 lexer->x++;
284 if (ch == '\n') {
285 lexer->x = 0;
286 lexer->y++;
287 }
288
David Turner9a5f7ce2010-09-09 22:59:04 +0200289 do {
290 new_state = json_lexer[lexer->state][(uint8_t)ch];
291 char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
292 if (char_consumed) {
293 qstring_append_chr(lexer->token, ch);
294 }
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700295
David Turner9a5f7ce2010-09-09 22:59:04 +0200296 switch (new_state) {
297 case JSON_OPERATOR:
298 case JSON_ESCAPE:
299 case JSON_INTEGER:
300 case JSON_FLOAT:
301 case JSON_KEYWORD:
302 case JSON_STRING:
303 lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
304 case JSON_SKIP:
305 QDECREF(lexer->token);
306 lexer->token = qstring_new();
307 new_state = IN_START;
308 break;
David 'Digit' Turner1cea2fb2010-12-22 17:38:44 +0100309 case ERROR:
David Turner9a5f7ce2010-09-09 22:59:04 +0200310 return -EINVAL;
311 default:
312 break;
313 }
314 lexer->state = new_state;
315 } while (!char_consumed);
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700316 return 0;
317}
318
319int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
320{
321 size_t i;
322
323 for (i = 0; i < size; i++) {
324 int err;
325
326 err = json_lexer_feed_char(lexer, buffer[i]);
327 if (err < 0) {
328 return err;
329 }
330 }
331
332 return 0;
333}
334
335int json_lexer_flush(JSONLexer *lexer)
336{
David Turner9a5f7ce2010-09-09 22:59:04 +0200337 return lexer->state == IN_START ? 0 : json_lexer_feed_char(lexer, 0);
David 'Digit' Turner2910f182010-05-10 18:48:35 -0700338}
339
340void json_lexer_destroy(JSONLexer *lexer)
341{
342 QDECREF(lexer->token);
343}