optimizations to json_tokener_parse_ex(), printbuf_memappend()
-- Brent Miller, bdmiller at yahoo dash inc dot com
git-svn-id: http://svn.metaparadigm.com/svn/json-c/trunk@34 327403b1-1117-474d-bef2-5cb71233fd97
diff --git a/json_tokener.c b/json_tokener.c
index beaa956..8e8c6d9 100644
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -7,6 +7,10 @@
* This library is free software; you can redistribute it and/or modify
* it under the terms of the MIT license. See COPYING for details.
*
+ *
+ * Copyright (c) 2008-2009 Yahoo! Inc. All rights reserved.
+ * The copyrights to the contents of this file are licensed under the MIT License
+ * (http://www.opensource.org/licenses/mit-license.php)
*/
#include "config.h"
@@ -135,35 +139,68 @@
#define current tok->stack[tok->depth].current
#define obj_field_name tok->stack[tok->depth].obj_field_name
+/* Optimization:
+ * json_tokener_parse_ex() consumed a lot of CPU in its main loop,
+ * iterating character-by-character. A large performance boost is
+ * achieved by using tighter loops to locally handle units such as
+ * comments and strings. Loops that handle an entire token within
+ * their scope also gather entire strings and pass them to
+ * printbuf_memappend() in a single call, rather than calling
+ * printbuf_memappend() one char at a time.
+ *
+ * POP_CHAR() and ADVANCE_CHAR() macros are used for code that is
+ * common to both the main loop and the tighter loops.
+ */
+
+/* POP_CHAR(dest, tok) macro:
+ * Not really a pop()...peeks at the current char and stores it in dest.
+ * Returns 1 on success, sets tok->err and returns 0 if no more chars.
+ * Implicit inputs: str, len, state, saved_state vars
+ */
+#define POP_CHAR(dest, tok) \
+ (((tok)->char_offset == len) ? \
+ (((tok)->depth == 0 && state == json_tokener_state_eatws && saved_state == json_tokener_state_finish) ? \
+ (((tok)->err = json_tokener_success), 0) \
+ : \
+ (((tok)->err = json_tokener_continue), 0) \
+ ) : \
+ (((dest) = *str), 1) \
+ )
+
+/* ADVANCE_CHAR() macro:
+ * Increments str & tok->char_offset.
+ * For convenience of existing conditionals, returns the old value of c (0 on eof)
+ * Implicit inputs: c var
+ */
+#define ADVANCE_CHAR(str, tok) \
+ ( ++(str), ((tok)->char_offset)++, c)
+
+/* End optimization macro defs */
+
+
struct json_object* json_tokener_parse_ex(struct json_tokener *tok,
char *str, int len)
{
struct json_object *obj = NULL;
- char c;
+ char c = '\1';
tok->char_offset = 0;
tok->err = json_tokener_success;
- do {
- if(tok->char_offset == len) {
- if(tok->depth == 0 && state == json_tokener_state_eatws &&
- saved_state == json_tokener_state_finish)
- tok->err = json_tokener_success;
- else
- tok->err = json_tokener_continue;
- goto out;
- }
+ while (POP_CHAR(c, tok)) {
- c = *str;
redo_char:
switch(state) {
case json_tokener_state_eatws:
- if(isspace(c)) {
- /* okay */
- } else if(c == '/') {
+ /* Advance until we change state */
+ while (isspace(c)) {
+ if ((!ADVANCE_CHAR(str, tok)) || (!POP_CHAR(c, tok)))
+ goto out;
+ }
+ if(c == '/') {
printbuf_reset(tok->pb);
- printbuf_memappend(tok->pb, &c, 1);
+ printbuf_memappend_fast(tok->pb, &c, 1);
state = json_tokener_state_comment_start;
} else {
state = saved_state;
@@ -236,7 +273,7 @@
goto redo_char;
case json_tokener_state_null:
- printbuf_memappend(tok->pb, &c, 1);
+ printbuf_memappend_fast(tok->pb, &c, 1);
if(strncasecmp(json_null_str, tok->pb->buf,
min(tok->st_pos+1, strlen(json_null_str))) == 0) {
if(tok->st_pos == strlen(json_null_str)) {
@@ -261,25 +298,42 @@
tok->err = json_tokener_error_parse_comment;
goto out;
}
- printbuf_memappend(tok->pb, &c, 1);
+ printbuf_memappend_fast(tok->pb, &c, 1);
break;
case json_tokener_state_comment:
- if(c == '*') state = json_tokener_state_comment_end;
- printbuf_memappend(tok->pb, &c, 1);
- break;
+ {
+ /* Advance until we change state */
+ char *case_start = str;
+ while(c != '*') {
+ if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
+ printbuf_memappend_fast(tok->pb, case_start, str-case_start);
+ goto out;
+ }
+ }
+ printbuf_memappend_fast(tok->pb, case_start, 1+str-case_start);
+ state = json_tokener_state_comment_end;
+ }
+ break;
case json_tokener_state_comment_eol:
- if(c == '\n') {
+ {
+ /* Advance until we change state */
+ char *case_start = str;
+ while(c != '\n') {
+ if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
+ printbuf_memappend_fast(tok->pb, case_start, str-case_start);
+ goto out;
+ }
+ }
+ printbuf_memappend_fast(tok->pb, case_start, str-case_start);
MC_DEBUG("json_tokener_comment: %s\n", tok->pb->buf);
state = json_tokener_state_eatws;
- } else {
- printbuf_memappend(tok->pb, &c, 1);
}
break;
case json_tokener_state_comment_end:
- printbuf_memappend(tok->pb, &c, 1);
+ printbuf_memappend_fast(tok->pb, &c, 1);
if(c == '/') {
MC_DEBUG("json_tokener_comment: %s\n", tok->pb->buf);
state = json_tokener_state_eatws;
@@ -289,15 +343,27 @@
break;
case json_tokener_state_string:
- if(c == tok->quote_char) {
- current = json_object_new_string(tok->pb->buf);
- saved_state = json_tokener_state_finish;
- state = json_tokener_state_eatws;
- } else if(c == '\\') {
- saved_state = json_tokener_state_string;
- state = json_tokener_state_string_escape;
- } else {
- printbuf_memappend(tok->pb, &c, 1);
+ {
+ /* Advance until we change state */
+ char *case_start = str;
+ while(1) {
+ if(c == tok->quote_char) {
+ printbuf_memappend_fast(tok->pb, case_start, str-case_start);
+ current = json_object_new_string(tok->pb->buf);
+ saved_state = json_tokener_state_finish;
+ state = json_tokener_state_eatws;
+ break;
+ } else if(c == '\\') {
+ printbuf_memappend_fast(tok->pb, case_start, str-case_start);
+ saved_state = json_tokener_state_string;
+ state = json_tokener_state_string_escape;
+ break;
+ }
+ if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
+ printbuf_memappend_fast(tok->pb, case_start, str-case_start);
+ goto out;
+ }
+ }
}
break;
@@ -306,17 +372,17 @@
case '"':
case '\\':
case '/':
- printbuf_memappend(tok->pb, &c, 1);
+ printbuf_memappend_fast(tok->pb, &c, 1);
state = saved_state;
break;
case 'b':
case 'n':
case 'r':
case 't':
- if(c == 'b') printbuf_memappend(tok->pb, "\b", 1);
- else if(c == 'n') printbuf_memappend(tok->pb, "\n", 1);
- else if(c == 'r') printbuf_memappend(tok->pb, "\r", 1);
- else if(c == 't') printbuf_memappend(tok->pb, "\t", 1);
+ if(c == 'b') printbuf_memappend_fast(tok->pb, "\b", 1);
+ else if(c == 'n') printbuf_memappend_fast(tok->pb, "\n", 1);
+ else if(c == 'r') printbuf_memappend_fast(tok->pb, "\r", 1);
+ else if(c == 't') printbuf_memappend_fast(tok->pb, "\t", 1);
state = saved_state;
break;
case 'u':
@@ -331,33 +397,46 @@
break;
case json_tokener_state_escape_unicode:
- if(strchr(json_hex_chars, c)) {
- tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4));
- if(tok->st_pos == 4) {
- unsigned char utf_out[3];
- if (tok->ucs_char < 0x80) {
- utf_out[0] = tok->ucs_char;
- printbuf_memappend(tok->pb, (char*)utf_out, 1);
- } else if (tok->ucs_char < 0x800) {
- utf_out[0] = 0xc0 | (tok->ucs_char >> 6);
- utf_out[1] = 0x80 | (tok->ucs_char & 0x3f);
- printbuf_memappend(tok->pb, (char*)utf_out, 2);
- } else {
- utf_out[0] = 0xe0 | (tok->ucs_char >> 12);
- utf_out[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
- utf_out[2] = 0x80 | (tok->ucs_char & 0x3f);
- printbuf_memappend(tok->pb, (char*)utf_out, 3);
- }
- state = saved_state;
+ /* Note that the following code is inefficient for handling large
+ * chunks of extended chars, calling printbuf_memappend() once
+ * for each multi-byte character of input.
+ * This is a good area for future optimization.
+ */
+ {
+ /* Advance until we change state */
+ while(1) {
+ if(strchr(json_hex_chars, c)) {
+ tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4));
+ if(tok->st_pos == 4) {
+ unsigned char utf_out[3];
+ if (tok->ucs_char < 0x80) {
+ utf_out[0] = tok->ucs_char;
+ printbuf_memappend_fast(tok->pb, (char*)utf_out, 1);
+ } else if (tok->ucs_char < 0x800) {
+ utf_out[0] = 0xc0 | (tok->ucs_char >> 6);
+ utf_out[1] = 0x80 | (tok->ucs_char & 0x3f);
+ printbuf_memappend_fast(tok->pb, (char*)utf_out, 2);
+ } else {
+ utf_out[0] = 0xe0 | (tok->ucs_char >> 12);
+ utf_out[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
+ utf_out[2] = 0x80 | (tok->ucs_char & 0x3f);
+ printbuf_memappend_fast(tok->pb, (char*)utf_out, 3);
+ }
+ state = saved_state;
+ break;
+ }
+ } else {
+ tok->err = json_tokener_error_parse_string;
+ goto out;
+ }
+ if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok))
+ goto out;
}
- } else {
- tok->err = json_tokener_error_parse_string;
- goto out;
}
break;
case json_tokener_state_boolean:
- printbuf_memappend(tok->pb, &c, 1);
+ printbuf_memappend_fast(tok->pb, &c, 1);
if(strncasecmp(json_true_str, tok->pb->buf,
min(tok->st_pos+1, strlen(json_true_str))) == 0) {
if(tok->st_pos == strlen(json_true_str)) {
@@ -382,23 +461,35 @@
break;
case json_tokener_state_number:
- if(c && strchr(json_number_chars, c)) {
- printbuf_memappend(tok->pb, &c, 1);
- if(c == '.' || c == 'e' || c == 'E') tok->is_double = 1;
- } else {
- int numi;
- double numd;
- if(!tok->is_double && sscanf(tok->pb->buf, "%d", &numi) == 1) {
- current = json_object_new_int(numi);
- } else if(tok->is_double && sscanf(tok->pb->buf, "%lf", &numd) == 1) {
- current = json_object_new_double(numd);
- } else {
- tok->err = json_tokener_error_parse_number;
- goto out;
+ {
+ /* Advance until we change state */
+ char *case_start = str;
+ int case_len=0;
+ while(c && strchr(json_number_chars, c)) {
+ ++case_len;
+ if(c == '.' || c == 'e') tok->is_double = 1;
+ if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
+ printbuf_memappend_fast(tok->pb, case_start, case_len);
+ goto out;
+ }
}
- saved_state = json_tokener_state_finish;
- state = json_tokener_state_eatws;
- goto redo_char;
+ if (case_len>0)
+ printbuf_memappend_fast(tok->pb, case_start, case_len);
+ }
+ {
+ int numi;
+ double numd;
+ if(!tok->is_double && sscanf(tok->pb->buf, "%d", &numi) == 1) {
+ current = json_object_new_int(numi);
+ } else if(tok->is_double && sscanf(tok->pb->buf, "%lf", &numd) == 1) {
+ current = json_object_new_double(numd);
+ } else {
+ tok->err = json_tokener_error_parse_number;
+ goto out;
+ }
+ saved_state = json_tokener_state_finish;
+ state = json_tokener_state_eatws;
+ goto redo_char;
}
break;
@@ -452,15 +543,27 @@
break;
case json_tokener_state_object_field:
- if(c == tok->quote_char) {
- obj_field_name = strdup(tok->pb->buf);
- saved_state = json_tokener_state_object_field_end;
- state = json_tokener_state_eatws;
- } else if(c == '\\') {
- saved_state = json_tokener_state_object_field;
- state = json_tokener_state_string_escape;
- } else {
- printbuf_memappend(tok->pb, &c, 1);
+ {
+ /* Advance until we change state */
+ char *case_start = str;
+ while(1) {
+ if(c == tok->quote_char) {
+ printbuf_memappend_fast(tok->pb, case_start, str-case_start);
+ obj_field_name = strdup(tok->pb->buf);
+ saved_state = json_tokener_state_object_field_end;
+ state = json_tokener_state_eatws;
+ break;
+ } else if(c == '\\') {
+ printbuf_memappend_fast(tok->pb, case_start, str-case_start);
+ saved_state = json_tokener_state_object_field;
+ state = json_tokener_state_string_escape;
+ break;
+ }
+ if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
+ printbuf_memappend_fast(tok->pb, case_start, str-case_start);
+ goto out;
+ }
+ }
}
break;
@@ -506,15 +609,17 @@
break;
}
- str++;
- tok->char_offset++;
- } while(c);
-
- if(state != json_tokener_state_finish &&
- saved_state != json_tokener_state_finish)
- tok->err = json_tokener_error_parse_eof;
+ if (!ADVANCE_CHAR(str, tok))
+ goto out;
+ } /* while(POP_CHAR) */
out:
+ if (!c) { /* We hit an eof char (0) */
+ if(state != json_tokener_state_finish &&
+ saved_state != json_tokener_state_finish)
+ tok->err = json_tokener_error_parse_eof;
+ }
+
if(tok->err == json_tokener_success) return json_object_get(current);
MC_DEBUG("json_tokener_parse_ex: error %s at offset %d\n",
json_tokener_errors[tok->err], tok->char_offset);