blob: 20a07340bdc5cc37ca11e5ecca9bd53d37c275b7 [file] [log] [blame]
/*
 * Lightweight Embedded JSON Parser
 *
 * Copyright (C) 2013 Andy Green <andy@warmcat.com>
 * This code is licensed under LGPL 2.1
 * http://www.gnu.org/licenses/lgpl-2.1.html
 */
8
9#include <string.h>
10#include "lejp.h"
11
12#include <stdio.h>
13
14/**
15 * lejp_construct - prepare a struct lejp_ctx for use
16 *
17 * @ctx: pointer to your struct lejp_ctx
18 * @callback: your user callback which will received parsed tokens
19 * @user: optional user data pointer untouched by lejp
20 * @paths: your array of name elements you are interested in
21 * @count_paths: ARRAY_SIZE() of @paths
22 *
23 * Prepares your context struct for use with lejp
24 */
25
26void
27lejp_construct(struct lejp_ctx *ctx,
28 char (*callback)(struct lejp_ctx *ctx, char reason), void *user,
29 const char * const *paths, unsigned char count_paths)
30{
31 ctx->st[0].s = 0;
32 ctx->st[0].p = 0;
33 ctx->st[0].i = 0;
34 ctx->st[0].b = 0;
35 ctx->sp = 0;
36 ctx->ipos = 0;
37 ctx->ppos = 0;
38 ctx->path_match = 0;
39 ctx->path[0] = '\0';
40 ctx->callback = callback;
41 ctx->user = user;
42 ctx->paths = paths;
43 ctx->count_paths = count_paths;
44 ctx->line = 1;
45 ctx->callback(ctx, LEJPCB_CONSTRUCTED);
46}
47
48/**
49 * lejp_destruct - retire a previously constructed struct lejp_ctx
50 *
51 * @ctx: pointer to your struct lejp_ctx
52 *
53 * lejp does not perform any allocations, but since your user code might, this
54 * provides a one-time LEJPCB_DESTRUCTED callback at destruction time where
55 * you can clean up in your callback.
56 */
57
58void
59lejp_destruct(struct lejp_ctx *ctx)
60{
61 /* no allocations... just let callback know what it happening */
62 ctx->callback(ctx, LEJPCB_DESTRUCTED);
63}
64
65/**
66 * lejp_change_callback - switch to a different callback from now on
67 *
68 * @ctx: pointer to your struct lejp_ctx
69 * @callback: your user callback which will received parsed tokens
70 *
71 * This tells the old callback it was destroyed, in case you want to take any
72 * action because that callback "lost focus", then changes to the new
73 * callback and tells it first that it was constructed, and then started.
74 *
75 * Changing callback is a cheap and powerful trick to split out handlers
76 * according to information earlier in the parse. For example you may have
77 * a JSON pair "schema" whose value defines what can be expected for the rest
78 * of the JSON. Rather than having one huge callback for all cases, you can
79 * have an initial one looking for "schema" which then calls
80 * lejp_change_callback() to a handler specific for the schema.
81 *
82 * Notice that afterwards, you need to construct the context again anyway to
83 * parse another JSON object, and the callback is reset then to the main,
84 * schema-interpreting one. The construction action is very lightweight.
85 */
86
87void
88lejp_change_callback(struct lejp_ctx *ctx,
89 char (*callback)(struct lejp_ctx *ctx, char reason))
90{
91 ctx->callback(ctx, LEJPCB_DESTRUCTED);
92 ctx->callback = callback;
93 ctx->callback(ctx, LEJPCB_CONSTRUCTED);
94 ctx->callback(ctx, LEJPCB_START);
95}
96
97static void
98lejp_check_path_match(struct lejp_ctx *ctx)
99{
Andy Green37098ae2016-04-08 13:25:34 +0800100 const char *p, *q;
Andy Greencd0c6962016-03-28 10:12:37 +0800101 int n;
102
103 /* we only need to check if a match is not active */
104 for (n = 0; !ctx->path_match && n < ctx->count_paths; n++) {
Andy Green37098ae2016-04-08 13:25:34 +0800105 ctx->wildcount = 0;
106 p = ctx->path;
107 q = ctx->paths[n];
108 while (*p && *q) {
109 if (*q != '*') {
110 if (*p != *q)
111 break;
112 p++;
113 q++;
114 continue;
115 }
116 ctx->wild[ctx->wildcount++] = p - ctx->path;
117 q++;
118 while (*p && *p != '.')
119 p++;
120 }
121 if (*p || *q)
Andy Greencd0c6962016-03-28 10:12:37 +0800122 continue;
Andy Green37098ae2016-04-08 13:25:34 +0800123
Andy Greencd0c6962016-03-28 10:12:37 +0800124 ctx->path_match = n + 1;
125 ctx->path_match_len = ctx->ppos;
126 return;
127 }
Andy Green37098ae2016-04-08 13:25:34 +0800128
129 if (!ctx->path_match)
130 ctx->wildcount = 0;
131}
132
133int
134lejp_get_wildcard(struct lejp_ctx *ctx, int wildcard, char *dest, int len)
135{
136 int n;
137
138 if (wildcard >= ctx->wildcount || !len)
139 return 0;
140
141 n = ctx->wild[wildcard];
142
143 while (--len && n < ctx->ppos && ctx->path[n] != '.')
144 *dest++ = ctx->path[n++];
145
146 *dest = '\0';
147 n++;
148
149 return n - ctx->wild[wildcard];
Andy Greencd0c6962016-03-28 10:12:37 +0800150}
151
152/**
153 * lejp_parse - interpret some more incoming data incrementally
154 *
155 * @ctx: previously constructed parsing context
156 * @json: char buffer with the new data to interpret
157 * @len: amount of data in the buffer
158 *
159 * Because lejp is a stream parser, it incrementally parses as new data
160 * becomes available, maintaining all state in the context struct. So an
161 * incomplete JSON is a normal situation, getting you a LEJP_CONTINUE
162 * return, signalling there's no error but to call again with more data when
163 * it comes to complete the parsing. Successful parsing completes with a
164 * 0 or positive integer indicating how much of the last input buffer was
165 * unused.
166 */
167
168int
169lejp_parse(struct lejp_ctx *ctx, const unsigned char *json, int len)
170{
171 unsigned char c, n, s, ret = LEJP_REJECT_UNKNOWN;
172 static const char esc_char[] = "\"\\/bfnrt";
173 static const char esc_tran[] = "\"\\/\b\f\n\r\t";
174 static const char tokens[] = "rue alse ull ";
175
176 if (!ctx->sp && !ctx->ppos)
177 ctx->callback(ctx, LEJPCB_START);
178
179 while (len--) {
180 c = *json++;
181
182 s = ctx->st[ctx->sp].s;
183
184 /* skip whitespace unless we should care */
185 if (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '#') {
186 if (c == '\n') {
187 ctx->line++;
188 ctx->st[ctx->sp].s &= ~LEJP_FLAG_WS_COMMENTLINE;
189 }
190 if (!(s & LEJP_FLAG_WS_KEEP)) {
191 if (c == '#')
192 ctx->st[ctx->sp].s |=
193 LEJP_FLAG_WS_COMMENTLINE;
194 continue;
195 }
196 }
197
198 if (ctx->st[ctx->sp].s & LEJP_FLAG_WS_COMMENTLINE)
199 continue;
200
201 switch (s) {
202 case LEJP_IDLE:
203 if (c != '{') {
204 ret = LEJP_REJECT_IDLE_NO_BRACE;
205 goto reject;
206 }
207 ctx->callback(ctx, LEJPCB_OBJECT_START);
208 ctx->st[ctx->sp].s = LEJP_MEMBERS;
209 break;
210 case LEJP_MEMBERS:
211 if (c == '}') {
212 ctx->st[ctx->sp].s = LEJP_IDLE;
213 ret = LEJP_REJECT_MEMBERS_NO_CLOSE;
214 goto reject;
215 }
216 ctx->st[ctx->sp].s = LEJP_M_P;
217 goto redo_character;
218 case LEJP_M_P:
219 if (c != '\"') {
220 ret = LEJP_REJECT_MP_NO_OPEN_QUOTE;
221 goto reject;
222 }
223 /* push */
224 ctx->st[ctx->sp].s = LEJP_MP_DELIM;
225 c = LEJP_MP_STRING;
226 goto add_stack_level;
227
228 case LEJP_MP_STRING:
229 if (c == '\"') {
230 if (!ctx->sp) {
231 ret = LEJP_REJECT_MP_STRING_UNDERRUN;
232 goto reject;
233 }
234 if (ctx->st[ctx->sp - 1].s != LEJP_MP_DELIM) {
235 ctx->buf[ctx->npos] = '\0';
236 if (ctx->callback(ctx,
237 LEJPCB_VAL_STR_END) < 0) {
238 ret = LEJP_REJECT_CALLBACK;
239 goto reject;
240 }
241 }
242 /* pop */
243 ctx->sp--;
244 break;
245 }
246 if (c == '\\') {
247 ctx->st[ctx->sp].s = LEJP_MP_STRING_ESC;
248 break;
249 }
250 if (c < ' ') {/* "control characters" not allowed */
251 ret = LEJP_REJECT_MP_ILLEGAL_CTRL;
252 goto reject;
253 }
254 goto emit_string_char;
255
256 case LEJP_MP_STRING_ESC:
257 if (c == 'u') {
258 ctx->st[ctx->sp].s = LEJP_MP_STRING_ESC_U1;
259 ctx->uni = 0;
260 break;
261 }
262 for (n = 0; n < sizeof(esc_char); n++) {
263 if (c != esc_char[n])
264 continue;
265 /* found it */
266 c = esc_tran[n];
267 ctx->st[ctx->sp].s = LEJP_MP_STRING;
268 goto emit_string_char;
269 }
270 ret = LEJP_REJECT_MP_STRING_ESC_ILLEGAL_ESC;
271 /* illegal escape char */
272 goto reject;
273
274 case LEJP_MP_STRING_ESC_U1:
275 case LEJP_MP_STRING_ESC_U2:
276 case LEJP_MP_STRING_ESC_U3:
277 case LEJP_MP_STRING_ESC_U4:
278 ctx->uni <<= 4;
279 if (c >= '0' && c <= '9')
280 ctx->uni |= c - '0';
281 else
282 if (c >= 'a' && c <= 'f')
283 ctx->uni = c - 'a' + 10;
284 else
285 if (c >= 'A' && c <= 'F')
286 ctx->uni = c - 'A' + 10;
287 else {
288 ret = LEJP_REJECT_ILLEGAL_HEX;
289 goto reject;
290 }
291 ctx->st[ctx->sp].s++;
292 switch (s) {
293 case LEJP_MP_STRING_ESC_U2:
294 if (ctx->uni < 0x08)
295 break;
296 /*
297 * 0x08-0xff (0x0800 - 0xffff)
298 * emit 3-byte UTF-8
299 */
300 c = 0xe0 | ((ctx->uni >> 4) & 0xf);
301 goto emit_string_char;
302
303 case LEJP_MP_STRING_ESC_U3:
304 if (ctx->uni >= 0x080) {
305 /*
306 * 0x080 - 0xfff (0x0800 - 0xffff)
307 * middle 3-byte seq
308 * send ....XXXXXX..
309 */
310 c = 0x80 | ((ctx->uni >> 2) & 0x3f);
311 goto emit_string_char;
312 }
313 if (ctx->uni < 0x008)
314 break;
315 /*
316 * 0x008 - 0x7f (0x0080 - 0x07ff)
317 * start 2-byte seq
318 */
319 c = 0xc0 | (ctx->uni >> 2);
320 goto emit_string_char;
321
322 case LEJP_MP_STRING_ESC_U4:
323 if (ctx->uni >= 0x0080)
324 /* end of 2 or 3-byte seq */
325 c = 0x80 | (ctx->uni & 0x3f);
326 else
327 /* literal */
328 c = (unsigned char)ctx->uni;
329
330 ctx->st[ctx->sp].s = LEJP_MP_STRING;
331 goto emit_string_char;
332 default:
333 break;
334 }
335 break;
336
337 case LEJP_MP_DELIM:
338 if (c != ':') {
339 ret = LEJP_REJECT_MP_DELIM_MISSING_COLON;
340 goto reject;
341 }
342 ctx->st[ctx->sp].s = LEJP_MP_VALUE;
343 ctx->path[ctx->ppos] = '\0';
344
345 lejp_check_path_match(ctx);
346 ctx->callback(ctx, LEJPCB_PAIR_NAME);
347 break;
348
349 case LEJP_MP_VALUE:
350 if (c >= '0' && c <= '9') {
351 ctx->npos = 0;
352 ctx->dcount = 0;
353 ctx->f = 0;
354 ctx->st[ctx->sp].s = LEJP_MP_VALUE_NUM_INT;
355 goto redo_character;
356 }
357 switch (c) {
358 case'\"':
359 /* push */
360 ctx->st[ctx->sp].s = LEJP_MP_COMMA_OR_END;
361 c = LEJP_MP_STRING;
362 ctx->npos = 0;
363 ctx->buf[0] = '\0';
364 ctx->callback(ctx, LEJPCB_VAL_STR_START);
365 goto add_stack_level;
366
367 case '{':
368 /* push */
369 ctx->st[ctx->sp].s = LEJP_MP_COMMA_OR_END;
370 c = LEJP_MEMBERS;
371 lejp_check_path_match(ctx);
372 ctx->callback(ctx, LEJPCB_OBJECT_START);
373 ctx->path_match = 0;
374 goto add_stack_level;
375
376 case '[':
377 /* push */
378 ctx->st[ctx->sp].s = LEJP_MP_ARRAY_END;
379 c = LEJP_MP_VALUE;
380 ctx->path[ctx->ppos++] = '[';
381 ctx->path[ctx->ppos++] = ']';
382 ctx->path[ctx->ppos] = '\0';
383 ctx->callback(ctx, LEJPCB_ARRAY_START);
384 ctx->i[ctx->ipos++] = 0;
385 if (ctx->ipos > ARRAY_SIZE(ctx->i)) {
386 ret = LEJP_REJECT_MP_DELIM_ISTACK;
387 goto reject;
388 }
389 goto add_stack_level;
390
391 case 't': /* true */
392 ctx->uni = 0;
393 ctx->st[ctx->sp].s = LEJP_MP_VALUE_TOK;
394 break;
395
396 case 'f':
397 ctx->uni = 4;
398 ctx->st[ctx->sp].s = LEJP_MP_VALUE_TOK;
399 break;
400
401 case 'n':
402 ctx->uni = 4 + 5;
403 ctx->st[ctx->sp].s = LEJP_MP_VALUE_TOK;
404 break;
405 default:
406 ret = LEJP_REJECT_MP_DELIM_BAD_VALUE_START;
407 goto reject;
408 }
409 break;
410
411 case LEJP_MP_VALUE_NUM_INT:
412 if (!ctx->npos && c == '-') {
413 ctx->f |= LEJP_SEEN_MINUS;
414 goto append_npos;
415 }
416
417 if (ctx->dcount < 10 && c >= '0' && c <= '9') {
418 if (ctx->f & LEJP_SEEN_POINT)
419 ctx->f |= LEJP_SEEN_POST_POINT;
420 ctx->dcount++;
421 goto append_npos;
422 }
423 if (c == '.') {
424 if (ctx->dcount || (ctx->f & LEJP_SEEN_POINT)) {
425 ret = LEJP_REJECT_MP_VAL_NUM_FORMAT;
426 goto reject;
427 }
428 ctx->f |= LEJP_SEEN_POINT;
429 goto append_npos;
430 }
431 /*
432 * before exponent, if we had . we must have had at
433 * least one more digit
434 */
435 if ((ctx->f &
436 (LEJP_SEEN_POINT | LEJP_SEEN_POST_POINT)) ==
437 LEJP_SEEN_POINT) {
438 ret = LEJP_REJECT_MP_VAL_NUM_INT_NO_FRAC;
439 goto reject;
440 }
441 if (c == 'e' || c == 'E') {
442 if (ctx->f & LEJP_SEEN_EXP) {
443 ret = LEJP_REJECT_MP_VAL_NUM_FORMAT;
444 goto reject;
445 }
446 ctx->f |= LEJP_SEEN_EXP;
447 ctx->st[ctx->sp].s = LEJP_MP_VALUE_NUM_EXP;
448 goto append_npos;
449 }
450 /* if none of the above, did we even have a number? */
451 if (!ctx->dcount) {
452 ret = LEJP_REJECT_MP_VAL_NUM_FORMAT;
453 goto reject;
454 }
455
456 ctx->buf[ctx->npos] = '\0';
457 if (ctx->f & LEJP_SEEN_POINT)
458 ctx->callback(ctx, LEJPCB_VAL_NUM_FLOAT);
459 else
460 ctx->callback(ctx, LEJPCB_VAL_NUM_INT);
461
462 /* then this is the post-number character, loop */
463 ctx->st[ctx->sp].s = LEJP_MP_COMMA_OR_END;
464 goto redo_character;
465
466 case LEJP_MP_VALUE_NUM_EXP:
467 ctx->st[ctx->sp].s = LEJP_MP_VALUE_NUM_INT;
468 if (c >= '0' && c <= '9')
469 goto redo_character;
470 if (c == '+' || c == '-')
471 goto append_npos;
472 ret = LEJP_REJECT_MP_VAL_NUM_EXP_BAD_EXP;
473 goto reject;
474
475 case LEJP_MP_VALUE_TOK: /* true, false, null */
476 if (c != tokens[ctx->uni]) {
477 ret = LEJP_REJECT_MP_VAL_TOK_UNKNOWN;
478 goto reject;
479 }
480 ctx->uni++;
481 if (tokens[ctx->uni] != ' ')
482 break;
483 switch (ctx->uni) {
484 case 3:
485 ctx->buf[0] = '1';
486 ctx->buf[1] = '\0';
487 ctx->callback(ctx, LEJPCB_VAL_TRUE);
488 break;
489 case 8:
490 ctx->buf[0] = '0';
491 ctx->buf[1] = '\0';
492 ctx->callback(ctx, LEJPCB_VAL_FALSE);
493 break;
494 case 12:
495 ctx->buf[0] = '\0';
496 ctx->callback(ctx, LEJPCB_VAL_NULL);
497 break;
498 }
499 ctx->st[ctx->sp].s = LEJP_MP_COMMA_OR_END;
500 break;
501
502 case LEJP_MP_COMMA_OR_END:
503 ctx->path[ctx->ppos] = '\0';
504 if (c == ',') {
505 /* increment this stack level's index */
506 ctx->st[ctx->sp].s = LEJP_M_P;
507 if (!ctx->sp) {
508 ctx->ppos = 0;
509 /*
510 * since we came back to root level,
511 * no path can still match
512 */
513 ctx->path_match = 0;
514 break;
515 }
516 ctx->ppos = ctx->st[ctx->sp - 1].p;
517 ctx->path[ctx->ppos] = '\0';
518 if (ctx->path_match &&
519 ctx->ppos <= ctx->path_match_len)
520 /*
521 * we shrank the path to be
522 * smaller than the matching point
523 */
524 ctx->path_match = 0;
525
526 if (ctx->st[ctx->sp - 1].s != LEJP_MP_ARRAY_END)
527 break;
528 /* top level is definitely an array... */
529 if (ctx->ipos)
530 ctx->i[ctx->ipos - 1]++;
531 ctx->st[ctx->sp].s = LEJP_MP_VALUE;
532 break;
533 }
534 if (c == ']') {
535 if (!ctx->sp) {
536 ret = LEJP_REJECT_MP_C_OR_E_UNDERF;
537 goto reject;
538 }
539 /* pop */
540 ctx->sp--;
541 if (ctx->st[ctx->sp].s != LEJP_MP_ARRAY_END) {
542 ret = LEJP_REJECT_MP_C_OR_E_NOTARRAY;
543 goto reject;
544 }
545 /* drop the path [n] bit */
546 ctx->ppos = ctx->st[ctx->sp - 1].p;
547 ctx->ipos = ctx->st[ctx->sp - 1].i;
548 ctx->path[ctx->ppos] = '\0';
549 if (ctx->path_match &&
550 ctx->ppos <= ctx->path_match_len)
551 /*
552 * we shrank the path to be
553 * smaller than the matching point
554 */
555 ctx->path_match = 0;
556
557 /* do LEJP_MP_ARRAY_END processing */
558 goto redo_character;
559 }
560 if (c == '}') {
561 if (ctx->sp == 0) {
562 lejp_check_path_match(ctx);
563 ctx->callback(ctx, LEJPCB_OBJECT_END);
564 ctx->callback(ctx, LEJPCB_COMPLETE);
565 /* done, return unused amount */
566 return len;
567 }
568 /* pop */
569 ctx->sp--;
570 ctx->ppos = ctx->st[ctx->sp - 1].p;
571 ctx->ipos = ctx->st[ctx->sp - 1].i;
572 ctx->path[ctx->ppos] = '\0';
573 if (ctx->path_match &&
574 ctx->ppos <= ctx->path_match_len)
575 /*
576 * we shrank the path to be
577 * smaller than the matching point
578 */
579 ctx->path_match = 0;
580 lejp_check_path_match(ctx);
581 ctx->callback(ctx, LEJPCB_OBJECT_END);
582 break;
583 }
584
585 ret = LEJP_REJECT_MP_C_OR_E_NEITHER;
586 goto reject;
587
588 case LEJP_MP_ARRAY_END:
589 ctx->path[ctx->ppos] = '\0';
590 if (c == ',') {
591 /* increment this stack level's index */
592 if (ctx->ipos)
593 ctx->i[ctx->ipos - 1]++;
594 ctx->st[ctx->sp].s = LEJP_MP_VALUE;
595 if (ctx->sp)
596 ctx->ppos = ctx->st[ctx->sp - 1].p;
597 ctx->path[ctx->ppos] = '\0';
598 break;
599 }
600 if (c != ']') {
601 ret = LEJP_REJECT_MP_ARRAY_END_MISSING;
602 goto reject;
603 }
604
605 ctx->st[ctx->sp].s = LEJP_MP_COMMA_OR_END;
606 ctx->callback(ctx, LEJPCB_ARRAY_END);
607 break;
608 }
609
610 continue;
611
612emit_string_char:
613 if (!ctx->sp || ctx->st[ctx->sp - 1].s != LEJP_MP_DELIM) {
614 /* assemble the string value into chunks */
615 ctx->buf[ctx->npos++] = c;
616 if (ctx->npos == sizeof(ctx->buf) - 1) {
617 ctx->callback(ctx, LEJPCB_VAL_STR_CHUNK);
618 ctx->npos = 0;
619 }
620 continue;
621 }
622 /* name part of name:value pair */
623 ctx->path[ctx->ppos++] = c;
624 continue;
625
626add_stack_level:
627 /* push on to the object stack */
628 if (ctx->ppos && ctx->st[ctx->sp].s != LEJP_MP_COMMA_OR_END &&
629 ctx->st[ctx->sp].s != LEJP_MP_ARRAY_END)
630 ctx->path[ctx->ppos++] = '.';
631
632 ctx->st[ctx->sp].p = ctx->ppos;
633 ctx->st[ctx->sp].i = ctx->ipos;
634 if (++ctx->sp == ARRAY_SIZE(ctx->st)) {
635 ret = LEJP_REJECT_STACK_OVERFLOW;
636 goto reject;
637 }
638 ctx->path[ctx->ppos] = '\0';
639 ctx->st[ctx->sp].s = c;
640 ctx->st[ctx->sp].b = 0;
641 continue;
642
643append_npos:
644 if (ctx->npos >= sizeof(ctx->buf)) {
645 ret = LEJP_REJECT_NUM_TOO_LONG;
646 goto reject;
647 }
648 ctx->buf[ctx->npos++] = c;
649 continue;
650
651redo_character:
652 json--;
653 len++;
654 }
655
656 return LEJP_CONTINUE;
657
658reject:
659 ctx->callback(ctx, LEJPCB_FAILED);
660 return ret;
661}