/*
 * Lightweight Embedded JSON Parser
 *
 * Copyright (C) 2013 Andy Green <andy@warmcat.com>
 * This code is licensed under LGPL 2.1
 * http://www.gnu.org/licenses/lgpl-2.1.html
 */
8
9#include <string.h>
10#include "lejp.h"
11
12#include <stdio.h>
13
14/**
15 * lejp_construct - prepare a struct lejp_ctx for use
16 *
17 * @ctx: pointer to your struct lejp_ctx
18 * @callback: your user callback which will received parsed tokens
19 * @user: optional user data pointer untouched by lejp
20 * @paths: your array of name elements you are interested in
21 * @count_paths: ARRAY_SIZE() of @paths
22 *
23 * Prepares your context struct for use with lejp
24 */
25
26void
27lejp_construct(struct lejp_ctx *ctx,
28 char (*callback)(struct lejp_ctx *ctx, char reason), void *user,
29 const char * const *paths, unsigned char count_paths)
30{
31 ctx->st[0].s = 0;
32 ctx->st[0].p = 0;
33 ctx->st[0].i = 0;
34 ctx->st[0].b = 0;
35 ctx->sp = 0;
36 ctx->ipos = 0;
37 ctx->ppos = 0;
38 ctx->path_match = 0;
39 ctx->path[0] = '\0';
40 ctx->callback = callback;
41 ctx->user = user;
42 ctx->paths = paths;
43 ctx->count_paths = count_paths;
44 ctx->line = 1;
45 ctx->callback(ctx, LEJPCB_CONSTRUCTED);
46}
47
48/**
49 * lejp_destruct - retire a previously constructed struct lejp_ctx
50 *
51 * @ctx: pointer to your struct lejp_ctx
52 *
53 * lejp does not perform any allocations, but since your user code might, this
54 * provides a one-time LEJPCB_DESTRUCTED callback at destruction time where
55 * you can clean up in your callback.
56 */
57
58void
59lejp_destruct(struct lejp_ctx *ctx)
60{
61 /* no allocations... just let callback know what it happening */
62 ctx->callback(ctx, LEJPCB_DESTRUCTED);
63}
64
65/**
66 * lejp_change_callback - switch to a different callback from now on
67 *
68 * @ctx: pointer to your struct lejp_ctx
69 * @callback: your user callback which will received parsed tokens
70 *
71 * This tells the old callback it was destroyed, in case you want to take any
72 * action because that callback "lost focus", then changes to the new
73 * callback and tells it first that it was constructed, and then started.
74 *
75 * Changing callback is a cheap and powerful trick to split out handlers
76 * according to information earlier in the parse. For example you may have
77 * a JSON pair "schema" whose value defines what can be expected for the rest
78 * of the JSON. Rather than having one huge callback for all cases, you can
79 * have an initial one looking for "schema" which then calls
80 * lejp_change_callback() to a handler specific for the schema.
81 *
82 * Notice that afterwards, you need to construct the context again anyway to
83 * parse another JSON object, and the callback is reset then to the main,
84 * schema-interpreting one. The construction action is very lightweight.
85 */
86
87void
88lejp_change_callback(struct lejp_ctx *ctx,
89 char (*callback)(struct lejp_ctx *ctx, char reason))
90{
91 ctx->callback(ctx, LEJPCB_DESTRUCTED);
92 ctx->callback = callback;
93 ctx->callback(ctx, LEJPCB_CONSTRUCTED);
94 ctx->callback(ctx, LEJPCB_START);
95}
96
97static void
98lejp_check_path_match(struct lejp_ctx *ctx)
99{
100 int n;
101
102 /* we only need to check if a match is not active */
103 for (n = 0; !ctx->path_match && n < ctx->count_paths; n++) {
104 if (strcmp(ctx->path, ctx->paths[n]))
105 continue;
106 ctx->path_match = n + 1;
107 ctx->path_match_len = ctx->ppos;
108 return;
109 }
110}
111
112/**
113 * lejp_parse - interpret some more incoming data incrementally
114 *
115 * @ctx: previously constructed parsing context
116 * @json: char buffer with the new data to interpret
117 * @len: amount of data in the buffer
118 *
119 * Because lejp is a stream parser, it incrementally parses as new data
120 * becomes available, maintaining all state in the context struct. So an
121 * incomplete JSON is a normal situation, getting you a LEJP_CONTINUE
122 * return, signalling there's no error but to call again with more data when
123 * it comes to complete the parsing. Successful parsing completes with a
124 * 0 or positive integer indicating how much of the last input buffer was
125 * unused.
126 */
127
128int
129lejp_parse(struct lejp_ctx *ctx, const unsigned char *json, int len)
130{
131 unsigned char c, n, s, ret = LEJP_REJECT_UNKNOWN;
132 static const char esc_char[] = "\"\\/bfnrt";
133 static const char esc_tran[] = "\"\\/\b\f\n\r\t";
134 static const char tokens[] = "rue alse ull ";
135
136 if (!ctx->sp && !ctx->ppos)
137 ctx->callback(ctx, LEJPCB_START);
138
139 while (len--) {
140 c = *json++;
141
142 s = ctx->st[ctx->sp].s;
143
144 /* skip whitespace unless we should care */
145 if (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '#') {
146 if (c == '\n') {
147 ctx->line++;
148 ctx->st[ctx->sp].s &= ~LEJP_FLAG_WS_COMMENTLINE;
149 }
150 if (!(s & LEJP_FLAG_WS_KEEP)) {
151 if (c == '#')
152 ctx->st[ctx->sp].s |=
153 LEJP_FLAG_WS_COMMENTLINE;
154 continue;
155 }
156 }
157
158 if (ctx->st[ctx->sp].s & LEJP_FLAG_WS_COMMENTLINE)
159 continue;
160
161 switch (s) {
162 case LEJP_IDLE:
163 if (c != '{') {
164 ret = LEJP_REJECT_IDLE_NO_BRACE;
165 goto reject;
166 }
167 ctx->callback(ctx, LEJPCB_OBJECT_START);
168 ctx->st[ctx->sp].s = LEJP_MEMBERS;
169 break;
170 case LEJP_MEMBERS:
171 if (c == '}') {
172 ctx->st[ctx->sp].s = LEJP_IDLE;
173 ret = LEJP_REJECT_MEMBERS_NO_CLOSE;
174 goto reject;
175 }
176 ctx->st[ctx->sp].s = LEJP_M_P;
177 goto redo_character;
178 case LEJP_M_P:
179 if (c != '\"') {
180 ret = LEJP_REJECT_MP_NO_OPEN_QUOTE;
181 goto reject;
182 }
183 /* push */
184 ctx->st[ctx->sp].s = LEJP_MP_DELIM;
185 c = LEJP_MP_STRING;
186 goto add_stack_level;
187
188 case LEJP_MP_STRING:
189 if (c == '\"') {
190 if (!ctx->sp) {
191 ret = LEJP_REJECT_MP_STRING_UNDERRUN;
192 goto reject;
193 }
194 if (ctx->st[ctx->sp - 1].s != LEJP_MP_DELIM) {
195 ctx->buf[ctx->npos] = '\0';
196 if (ctx->callback(ctx,
197 LEJPCB_VAL_STR_END) < 0) {
198 ret = LEJP_REJECT_CALLBACK;
199 goto reject;
200 }
201 }
202 /* pop */
203 ctx->sp--;
204 break;
205 }
206 if (c == '\\') {
207 ctx->st[ctx->sp].s = LEJP_MP_STRING_ESC;
208 break;
209 }
210 if (c < ' ') {/* "control characters" not allowed */
211 ret = LEJP_REJECT_MP_ILLEGAL_CTRL;
212 goto reject;
213 }
214 goto emit_string_char;
215
216 case LEJP_MP_STRING_ESC:
217 if (c == 'u') {
218 ctx->st[ctx->sp].s = LEJP_MP_STRING_ESC_U1;
219 ctx->uni = 0;
220 break;
221 }
222 for (n = 0; n < sizeof(esc_char); n++) {
223 if (c != esc_char[n])
224 continue;
225 /* found it */
226 c = esc_tran[n];
227 ctx->st[ctx->sp].s = LEJP_MP_STRING;
228 goto emit_string_char;
229 }
230 ret = LEJP_REJECT_MP_STRING_ESC_ILLEGAL_ESC;
231 /* illegal escape char */
232 goto reject;
233
234 case LEJP_MP_STRING_ESC_U1:
235 case LEJP_MP_STRING_ESC_U2:
236 case LEJP_MP_STRING_ESC_U3:
237 case LEJP_MP_STRING_ESC_U4:
238 ctx->uni <<= 4;
239 if (c >= '0' && c <= '9')
240 ctx->uni |= c - '0';
241 else
242 if (c >= 'a' && c <= 'f')
243 ctx->uni = c - 'a' + 10;
244 else
245 if (c >= 'A' && c <= 'F')
246 ctx->uni = c - 'A' + 10;
247 else {
248 ret = LEJP_REJECT_ILLEGAL_HEX;
249 goto reject;
250 }
251 ctx->st[ctx->sp].s++;
252 switch (s) {
253 case LEJP_MP_STRING_ESC_U2:
254 if (ctx->uni < 0x08)
255 break;
256 /*
257 * 0x08-0xff (0x0800 - 0xffff)
258 * emit 3-byte UTF-8
259 */
260 c = 0xe0 | ((ctx->uni >> 4) & 0xf);
261 goto emit_string_char;
262
263 case LEJP_MP_STRING_ESC_U3:
264 if (ctx->uni >= 0x080) {
265 /*
266 * 0x080 - 0xfff (0x0800 - 0xffff)
267 * middle 3-byte seq
268 * send ....XXXXXX..
269 */
270 c = 0x80 | ((ctx->uni >> 2) & 0x3f);
271 goto emit_string_char;
272 }
273 if (ctx->uni < 0x008)
274 break;
275 /*
276 * 0x008 - 0x7f (0x0080 - 0x07ff)
277 * start 2-byte seq
278 */
279 c = 0xc0 | (ctx->uni >> 2);
280 goto emit_string_char;
281
282 case LEJP_MP_STRING_ESC_U4:
283 if (ctx->uni >= 0x0080)
284 /* end of 2 or 3-byte seq */
285 c = 0x80 | (ctx->uni & 0x3f);
286 else
287 /* literal */
288 c = (unsigned char)ctx->uni;
289
290 ctx->st[ctx->sp].s = LEJP_MP_STRING;
291 goto emit_string_char;
292 default:
293 break;
294 }
295 break;
296
297 case LEJP_MP_DELIM:
298 if (c != ':') {
299 ret = LEJP_REJECT_MP_DELIM_MISSING_COLON;
300 goto reject;
301 }
302 ctx->st[ctx->sp].s = LEJP_MP_VALUE;
303 ctx->path[ctx->ppos] = '\0';
304
305 lejp_check_path_match(ctx);
306 ctx->callback(ctx, LEJPCB_PAIR_NAME);
307 break;
308
309 case LEJP_MP_VALUE:
310 if (c >= '0' && c <= '9') {
311 ctx->npos = 0;
312 ctx->dcount = 0;
313 ctx->f = 0;
314 ctx->st[ctx->sp].s = LEJP_MP_VALUE_NUM_INT;
315 goto redo_character;
316 }
317 switch (c) {
318 case'\"':
319 /* push */
320 ctx->st[ctx->sp].s = LEJP_MP_COMMA_OR_END;
321 c = LEJP_MP_STRING;
322 ctx->npos = 0;
323 ctx->buf[0] = '\0';
324 ctx->callback(ctx, LEJPCB_VAL_STR_START);
325 goto add_stack_level;
326
327 case '{':
328 /* push */
329 ctx->st[ctx->sp].s = LEJP_MP_COMMA_OR_END;
330 c = LEJP_MEMBERS;
331 lejp_check_path_match(ctx);
332 ctx->callback(ctx, LEJPCB_OBJECT_START);
333 ctx->path_match = 0;
334 goto add_stack_level;
335
336 case '[':
337 /* push */
338 ctx->st[ctx->sp].s = LEJP_MP_ARRAY_END;
339 c = LEJP_MP_VALUE;
340 ctx->path[ctx->ppos++] = '[';
341 ctx->path[ctx->ppos++] = ']';
342 ctx->path[ctx->ppos] = '\0';
343 ctx->callback(ctx, LEJPCB_ARRAY_START);
344 ctx->i[ctx->ipos++] = 0;
345 if (ctx->ipos > ARRAY_SIZE(ctx->i)) {
346 ret = LEJP_REJECT_MP_DELIM_ISTACK;
347 goto reject;
348 }
349 goto add_stack_level;
350
351 case 't': /* true */
352 ctx->uni = 0;
353 ctx->st[ctx->sp].s = LEJP_MP_VALUE_TOK;
354 break;
355
356 case 'f':
357 ctx->uni = 4;
358 ctx->st[ctx->sp].s = LEJP_MP_VALUE_TOK;
359 break;
360
361 case 'n':
362 ctx->uni = 4 + 5;
363 ctx->st[ctx->sp].s = LEJP_MP_VALUE_TOK;
364 break;
365 default:
366 ret = LEJP_REJECT_MP_DELIM_BAD_VALUE_START;
367 goto reject;
368 }
369 break;
370
371 case LEJP_MP_VALUE_NUM_INT:
372 if (!ctx->npos && c == '-') {
373 ctx->f |= LEJP_SEEN_MINUS;
374 goto append_npos;
375 }
376
377 if (ctx->dcount < 10 && c >= '0' && c <= '9') {
378 if (ctx->f & LEJP_SEEN_POINT)
379 ctx->f |= LEJP_SEEN_POST_POINT;
380 ctx->dcount++;
381 goto append_npos;
382 }
383 if (c == '.') {
384 if (ctx->dcount || (ctx->f & LEJP_SEEN_POINT)) {
385 ret = LEJP_REJECT_MP_VAL_NUM_FORMAT;
386 goto reject;
387 }
388 ctx->f |= LEJP_SEEN_POINT;
389 goto append_npos;
390 }
391 /*
392 * before exponent, if we had . we must have had at
393 * least one more digit
394 */
395 if ((ctx->f &
396 (LEJP_SEEN_POINT | LEJP_SEEN_POST_POINT)) ==
397 LEJP_SEEN_POINT) {
398 ret = LEJP_REJECT_MP_VAL_NUM_INT_NO_FRAC;
399 goto reject;
400 }
401 if (c == 'e' || c == 'E') {
402 if (ctx->f & LEJP_SEEN_EXP) {
403 ret = LEJP_REJECT_MP_VAL_NUM_FORMAT;
404 goto reject;
405 }
406 ctx->f |= LEJP_SEEN_EXP;
407 ctx->st[ctx->sp].s = LEJP_MP_VALUE_NUM_EXP;
408 goto append_npos;
409 }
410 /* if none of the above, did we even have a number? */
411 if (!ctx->dcount) {
412 ret = LEJP_REJECT_MP_VAL_NUM_FORMAT;
413 goto reject;
414 }
415
416 ctx->buf[ctx->npos] = '\0';
417 if (ctx->f & LEJP_SEEN_POINT)
418 ctx->callback(ctx, LEJPCB_VAL_NUM_FLOAT);
419 else
420 ctx->callback(ctx, LEJPCB_VAL_NUM_INT);
421
422 /* then this is the post-number character, loop */
423 ctx->st[ctx->sp].s = LEJP_MP_COMMA_OR_END;
424 goto redo_character;
425
426 case LEJP_MP_VALUE_NUM_EXP:
427 ctx->st[ctx->sp].s = LEJP_MP_VALUE_NUM_INT;
428 if (c >= '0' && c <= '9')
429 goto redo_character;
430 if (c == '+' || c == '-')
431 goto append_npos;
432 ret = LEJP_REJECT_MP_VAL_NUM_EXP_BAD_EXP;
433 goto reject;
434
435 case LEJP_MP_VALUE_TOK: /* true, false, null */
436 if (c != tokens[ctx->uni]) {
437 ret = LEJP_REJECT_MP_VAL_TOK_UNKNOWN;
438 goto reject;
439 }
440 ctx->uni++;
441 if (tokens[ctx->uni] != ' ')
442 break;
443 switch (ctx->uni) {
444 case 3:
445 ctx->buf[0] = '1';
446 ctx->buf[1] = '\0';
447 ctx->callback(ctx, LEJPCB_VAL_TRUE);
448 break;
449 case 8:
450 ctx->buf[0] = '0';
451 ctx->buf[1] = '\0';
452 ctx->callback(ctx, LEJPCB_VAL_FALSE);
453 break;
454 case 12:
455 ctx->buf[0] = '\0';
456 ctx->callback(ctx, LEJPCB_VAL_NULL);
457 break;
458 }
459 ctx->st[ctx->sp].s = LEJP_MP_COMMA_OR_END;
460 break;
461
462 case LEJP_MP_COMMA_OR_END:
463 ctx->path[ctx->ppos] = '\0';
464 if (c == ',') {
465 /* increment this stack level's index */
466 ctx->st[ctx->sp].s = LEJP_M_P;
467 if (!ctx->sp) {
468 ctx->ppos = 0;
469 /*
470 * since we came back to root level,
471 * no path can still match
472 */
473 ctx->path_match = 0;
474 break;
475 }
476 ctx->ppos = ctx->st[ctx->sp - 1].p;
477 ctx->path[ctx->ppos] = '\0';
478 if (ctx->path_match &&
479 ctx->ppos <= ctx->path_match_len)
480 /*
481 * we shrank the path to be
482 * smaller than the matching point
483 */
484 ctx->path_match = 0;
485
486 if (ctx->st[ctx->sp - 1].s != LEJP_MP_ARRAY_END)
487 break;
488 /* top level is definitely an array... */
489 if (ctx->ipos)
490 ctx->i[ctx->ipos - 1]++;
491 ctx->st[ctx->sp].s = LEJP_MP_VALUE;
492 break;
493 }
494 if (c == ']') {
495 if (!ctx->sp) {
496 ret = LEJP_REJECT_MP_C_OR_E_UNDERF;
497 goto reject;
498 }
499 /* pop */
500 ctx->sp--;
501 if (ctx->st[ctx->sp].s != LEJP_MP_ARRAY_END) {
502 ret = LEJP_REJECT_MP_C_OR_E_NOTARRAY;
503 goto reject;
504 }
505 /* drop the path [n] bit */
506 ctx->ppos = ctx->st[ctx->sp - 1].p;
507 ctx->ipos = ctx->st[ctx->sp - 1].i;
508 ctx->path[ctx->ppos] = '\0';
509 if (ctx->path_match &&
510 ctx->ppos <= ctx->path_match_len)
511 /*
512 * we shrank the path to be
513 * smaller than the matching point
514 */
515 ctx->path_match = 0;
516
517 /* do LEJP_MP_ARRAY_END processing */
518 goto redo_character;
519 }
520 if (c == '}') {
521 if (ctx->sp == 0) {
522 lejp_check_path_match(ctx);
523 ctx->callback(ctx, LEJPCB_OBJECT_END);
524 ctx->callback(ctx, LEJPCB_COMPLETE);
525 /* done, return unused amount */
526 return len;
527 }
528 /* pop */
529 ctx->sp--;
530 ctx->ppos = ctx->st[ctx->sp - 1].p;
531 ctx->ipos = ctx->st[ctx->sp - 1].i;
532 ctx->path[ctx->ppos] = '\0';
533 if (ctx->path_match &&
534 ctx->ppos <= ctx->path_match_len)
535 /*
536 * we shrank the path to be
537 * smaller than the matching point
538 */
539 ctx->path_match = 0;
540 lejp_check_path_match(ctx);
541 ctx->callback(ctx, LEJPCB_OBJECT_END);
542 break;
543 }
544
545 ret = LEJP_REJECT_MP_C_OR_E_NEITHER;
546 goto reject;
547
548 case LEJP_MP_ARRAY_END:
549 ctx->path[ctx->ppos] = '\0';
550 if (c == ',') {
551 /* increment this stack level's index */
552 if (ctx->ipos)
553 ctx->i[ctx->ipos - 1]++;
554 ctx->st[ctx->sp].s = LEJP_MP_VALUE;
555 if (ctx->sp)
556 ctx->ppos = ctx->st[ctx->sp - 1].p;
557 ctx->path[ctx->ppos] = '\0';
558 break;
559 }
560 if (c != ']') {
561 ret = LEJP_REJECT_MP_ARRAY_END_MISSING;
562 goto reject;
563 }
564
565 ctx->st[ctx->sp].s = LEJP_MP_COMMA_OR_END;
566 ctx->callback(ctx, LEJPCB_ARRAY_END);
567 break;
568 }
569
570 continue;
571
572emit_string_char:
573 if (!ctx->sp || ctx->st[ctx->sp - 1].s != LEJP_MP_DELIM) {
574 /* assemble the string value into chunks */
575 ctx->buf[ctx->npos++] = c;
576 if (ctx->npos == sizeof(ctx->buf) - 1) {
577 ctx->callback(ctx, LEJPCB_VAL_STR_CHUNK);
578 ctx->npos = 0;
579 }
580 continue;
581 }
582 /* name part of name:value pair */
583 ctx->path[ctx->ppos++] = c;
584 continue;
585
586add_stack_level:
587 /* push on to the object stack */
588 if (ctx->ppos && ctx->st[ctx->sp].s != LEJP_MP_COMMA_OR_END &&
589 ctx->st[ctx->sp].s != LEJP_MP_ARRAY_END)
590 ctx->path[ctx->ppos++] = '.';
591
592 ctx->st[ctx->sp].p = ctx->ppos;
593 ctx->st[ctx->sp].i = ctx->ipos;
594 if (++ctx->sp == ARRAY_SIZE(ctx->st)) {
595 ret = LEJP_REJECT_STACK_OVERFLOW;
596 goto reject;
597 }
598 ctx->path[ctx->ppos] = '\0';
599 ctx->st[ctx->sp].s = c;
600 ctx->st[ctx->sp].b = 0;
601 continue;
602
603append_npos:
604 if (ctx->npos >= sizeof(ctx->buf)) {
605 ret = LEJP_REJECT_NUM_TOO_LONG;
606 goto reject;
607 }
608 ctx->buf[ctx->npos++] = c;
609 continue;
610
611redo_character:
612 json--;
613 len++;
614 }
615
616 return LEJP_CONTINUE;
617
618reject:
619 ctx->callback(ctx, LEJPCB_FAILED);
620 return ret;
621}