Upgrade libxml2 to 1a360c1c2ec950f478d55b31722ecf78f5698e97
Also change the upstream source to GitHub.
This change moves away from the stable release because we need CMakeLists.txt, which is not in any release yet.
Upstream is likely to publish another stable version within this year. We can upgrade to that version when it is available.
Bug: 157157503
Change-Id: If6f245dbabe36a114563d209c8e100b7e3083f20
diff --git a/HTMLparser.c b/HTMLparser.c
index 9e60e27..b981298 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -69,7 +69,7 @@
/**
* htmlErrMemory:
* @ctxt: an HTML parser context
- * @extra: extra informations
+ * @extra: extra information
*
* Handle a redefinition of attribute error
*/
@@ -317,7 +317,7 @@
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
-/* Inported from XML */
+/* Imported from XML */
/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
@@ -414,6 +414,10 @@
static int
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
+ const unsigned char *cur;
+ unsigned char c;
+ unsigned int val;
+
if (ctxt->instate == XML_PARSER_EOF)
return(0);
@@ -421,99 +425,29 @@
*len = 0;
return(ctxt->token);
}
- if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
- /*
- * We are supposed to handle UTF8, check it's valid
- * From rfc2044: encoding of the Unicode values on UTF-8:
- *
- * UCS-4 range (hex.) UTF-8 octet sequence (binary)
- * 0000 0000-0000 007F 0xxxxxxx
- * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
- * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
- *
- * Check for the 0x110000 limit too
- */
- const unsigned char *cur = ctxt->input->cur;
- unsigned char c;
- unsigned int val;
-
- c = *cur;
- if (c & 0x80) {
- if (cur[1] == 0) {
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- cur = ctxt->input->cur;
- }
- if ((cur[1] & 0xc0) != 0x80)
- goto encoding_error;
- if ((c & 0xe0) == 0xe0) {
-
- if (cur[2] == 0) {
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- cur = ctxt->input->cur;
- }
- if ((cur[2] & 0xc0) != 0x80)
- goto encoding_error;
- if ((c & 0xf0) == 0xf0) {
- if (cur[3] == 0) {
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- cur = ctxt->input->cur;
- }
- if (((c & 0xf8) != 0xf0) ||
- ((cur[3] & 0xc0) != 0x80))
- goto encoding_error;
- /* 4-byte code */
- *len = 4;
- val = (cur[0] & 0x7) << 18;
- val |= (cur[1] & 0x3f) << 12;
- val |= (cur[2] & 0x3f) << 6;
- val |= cur[3] & 0x3f;
- } else {
- /* 3-byte code */
- *len = 3;
- val = (cur[0] & 0xf) << 12;
- val |= (cur[1] & 0x3f) << 6;
- val |= cur[2] & 0x3f;
- }
- } else {
- /* 2-byte code */
- *len = 2;
- val = (cur[0] & 0x1f) << 6;
- val |= cur[1] & 0x3f;
- }
- if (!IS_CHAR(val)) {
- htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
- "Char 0x%X out of allowed range\n", val);
- }
- return(val);
- } else {
- if ((*ctxt->input->cur == 0) &&
- (ctxt->input->cur < ctxt->input->end)) {
- htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
- "Char 0x%X out of allowed range\n", 0);
- *len = 1;
- return(' ');
- }
- /* 1-byte code */
- *len = 1;
- return((int) *ctxt->input->cur);
- }
- }
- /*
- * Assume it's a fixed length encoding (1) with
- * a compatible encoding for the ASCII set, since
- * XML constructs only use < 128 chars
- */
- *len = 1;
- if ((int) *ctxt->input->cur < 0x80)
- return((int) *ctxt->input->cur);
-
- /*
- * Humm this is bad, do an automatic flow conversion
- */
- {
+ if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
xmlChar * guess;
xmlCharEncodingHandlerPtr handler;
+ /*
+ * Assume it's a fixed length encoding (1) with
+ * a compatible encoding for the ASCII set, since
+ * HTML constructs only use < 128 chars
+ */
+ if ((int) *ctxt->input->cur < 0x80) {
+ *len = 1;
+ if ((*ctxt->input->cur == 0) &&
+ (ctxt->input->cur < ctxt->input->end)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Char 0x%X out of allowed range\n", 0);
+ return(' ');
+ }
+ return((int) *ctxt->input->cur);
+ }
+
+ /*
+ * Humm this is bad, do an automatic flow conversion
+ */
guess = htmlFindEncoding(ctxt);
if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
@@ -532,12 +466,91 @@
ctxt->charset = XML_CHAR_ENCODING_UTF8;
}
- return(xmlCurrentChar(ctxt, len));
+ /*
+ * We are supposed to handle UTF8, check it's valid
+ * From rfc2044: encoding of the Unicode values on UTF-8:
+ *
+ * UCS-4 range (hex.) UTF-8 octet sequence (binary)
+ * 0000 0000-0000 007F 0xxxxxxx
+ * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
+ * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
+ *
+ * Check for the 0x110000 limit too
+ */
+ cur = ctxt->input->cur;
+ c = *cur;
+ if (c & 0x80) {
+ if ((c & 0x40) == 0)
+ goto encoding_error;
+ if (cur[1] == 0) {
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ cur = ctxt->input->cur;
+ }
+ if ((cur[1] & 0xc0) != 0x80)
+ goto encoding_error;
+ if ((c & 0xe0) == 0xe0) {
+
+ if (cur[2] == 0) {
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ cur = ctxt->input->cur;
+ }
+ if ((cur[2] & 0xc0) != 0x80)
+ goto encoding_error;
+ if ((c & 0xf0) == 0xf0) {
+ if (cur[3] == 0) {
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ cur = ctxt->input->cur;
+ }
+ if (((c & 0xf8) != 0xf0) ||
+ ((cur[3] & 0xc0) != 0x80))
+ goto encoding_error;
+ /* 4-byte code */
+ *len = 4;
+ val = (cur[0] & 0x7) << 18;
+ val |= (cur[1] & 0x3f) << 12;
+ val |= (cur[2] & 0x3f) << 6;
+ val |= cur[3] & 0x3f;
+ if (val < 0x10000)
+ goto encoding_error;
+ } else {
+ /* 3-byte code */
+ *len = 3;
+ val = (cur[0] & 0xf) << 12;
+ val |= (cur[1] & 0x3f) << 6;
+ val |= cur[2] & 0x3f;
+ if (val < 0x800)
+ goto encoding_error;
+ }
+ } else {
+ /* 2-byte code */
+ *len = 2;
+ val = (cur[0] & 0x1f) << 6;
+ val |= cur[1] & 0x3f;
+ if (val < 0x80)
+ goto encoding_error;
+ }
+ if (!IS_CHAR(val)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Char 0x%X out of allowed range\n", val);
+ }
+ return(val);
+ } else {
+ if ((*ctxt->input->cur == 0) &&
+ (ctxt->input->cur < ctxt->input->end)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Char 0x%X out of allowed range\n", 0);
+ *len = 1;
+ return(' ');
+ }
+ /* 1-byte code */
+ *len = 1;
+ return((int) *ctxt->input->cur);
+ }
encoding_error:
/*
* If we detect an UTF8 error that probably mean that the
- * input encoding didn't get properly advertized in the
+ * input encoding didn't get properly advertised in the
* declaration header. Report the error and switch the encoding
* to ISO-Latin-1 (if you don't like this policy, just declare the
* encoding !)
@@ -602,8 +615,8 @@
************************************************************************/
/*
- * Start Tag: 1 means the start tag can be ommited
- * End Tag: 1 means the end tag can be ommited
+ * Start Tag: 1 means the start tag can be omitted
+ * End Tag: 1 means the end tag can be omitted
* 2 means it's forbidden (empty elements)
* 3 means the tag is stylistic and should be closed easily
* Depr: this element is deprecated
@@ -1342,7 +1355,7 @@
if (xmlStrEqual(newtag, ctxt->nameTab[i]))
break;
/*
- * A missplaced endtag can only close elements with lower
+ * A misplaced endtag can only close elements with lower
* or equal priority, so if we find an element with higher
* priority before we find an element with
* matching name, we just ignore this endtag
@@ -2176,6 +2189,7 @@
* *
************************************************************************/
+#ifdef LIBXML_PUSH_ENABLED
/**
* htmlNewInputStream:
* @ctxt: an HTML parser context
@@ -2207,6 +2221,7 @@
input->length = 0;
return(input);
}
+#endif
/************************************************************************
@@ -2216,9 +2231,9 @@
************************************************************************/
/*
* all tags allowing pc data from the html 4.01 loose dtd
- * NOTE: it might be more apropriate to integrate this information
+ * NOTE: it might be more appropriate to integrate this information
* into the html40ElementTable array but I don't want to risk any
- * binary incomptibility
+ * binary incompatibility
*/
static const char *allowPCData[] = {
"a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
@@ -2787,47 +2802,39 @@
static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
size_t len = 0, startPosition = 0;
+ int err = 0;
+ int quote;
xmlChar *ret = NULL;
- if (CUR == '"') {
- NEXT;
-
- if (CUR_PTR < BASE_PTR)
- return(ret);
- startPosition = CUR_PTR - BASE_PTR;
-
- while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
- NEXT;
- len++;
- }
- if (!IS_CHAR_CH(CUR)) {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
- "Unfinished SystemLiteral\n", NULL, NULL);
- } else {
- ret = xmlStrndup((BASE_PTR+startPosition), len);
- NEXT;
- }
- } else if (CUR == '\'') {
- NEXT;
-
- if (CUR_PTR < BASE_PTR)
- return(ret);
- startPosition = CUR_PTR - BASE_PTR;
-
- while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
- NEXT;
- len++;
- }
- if (!IS_CHAR_CH(CUR)) {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
- "Unfinished SystemLiteral\n", NULL, NULL);
- } else {
- ret = xmlStrndup((BASE_PTR+startPosition), len);
- NEXT;
- }
- } else {
+ if ((CUR != '"') && (CUR != '\'')) {
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
- " or ' expected\n", NULL, NULL);
+ "SystemLiteral \" or ' expected\n", NULL, NULL);
+ return(NULL);
+ }
+ quote = CUR;
+ NEXT;
+
+ if (CUR_PTR < BASE_PTR)
+ return(ret);
+ startPosition = CUR_PTR - BASE_PTR;
+
+ while ((CUR != 0) && (CUR != quote)) {
+ /* TODO: Handle UTF-8 */
+ if (!IS_CHAR_CH(CUR)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Invalid char in SystemLiteral 0x%X\n", CUR);
+ err = 1;
+ }
+ NEXT;
+ len++;
+ }
+ if (CUR != quote) {
+ htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
+ "Unfinished SystemLiteral\n", NULL, NULL);
+ } else {
+ NEXT;
+ if (err == 0)
+ ret = xmlStrndup((BASE_PTR+startPosition), len);
}
return(ret);
@@ -2847,51 +2854,42 @@
static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
size_t len = 0, startPosition = 0;
+ int err = 0;
+ int quote;
xmlChar *ret = NULL;
+
+ if ((CUR != '"') && (CUR != '\'')) {
+ htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
+ "PubidLiteral \" or ' expected\n", NULL, NULL);
+ return(NULL);
+ }
+ quote = CUR;
+ NEXT;
+
/*
* Name ::= (Letter | '_') (NameChar)*
*/
- if (CUR == '"') {
- NEXT;
+ if (CUR_PTR < BASE_PTR)
+ return(ret);
+ startPosition = CUR_PTR - BASE_PTR;
- if (CUR_PTR < BASE_PTR)
- return(ret);
- startPosition = CUR_PTR - BASE_PTR;
-
- while (IS_PUBIDCHAR_CH(CUR)) {
- len++;
- NEXT;
+ while ((CUR != 0) && (CUR != quote)) {
+ if (!IS_PUBIDCHAR_CH(CUR)) {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Invalid char in PubidLiteral 0x%X\n", CUR);
+ err = 1;
}
-
- if (CUR != '"') {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
- "Unfinished PubidLiteral\n", NULL, NULL);
- } else {
- ret = xmlStrndup((BASE_PTR + startPosition), len);
- NEXT;
- }
- } else if (CUR == '\'') {
+ len++;
NEXT;
+ }
- if (CUR_PTR < BASE_PTR)
- return(ret);
- startPosition = CUR_PTR - BASE_PTR;
-
- while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
- len++;
- NEXT;
- }
-
- if (CUR != '\'') {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
- "Unfinished PubidLiteral\n", NULL, NULL);
- } else {
- ret = xmlStrndup((BASE_PTR + startPosition), len);
- NEXT;
- }
+ if (CUR != quote) {
+ htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
+ "Unfinished PubidLiteral\n", NULL, NULL);
} else {
- htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
- "PubidLiteral \" or ' expected\n", NULL, NULL);
+ NEXT;
+ if (err == 0)
+ ret = xmlStrndup((BASE_PTR + startPosition), len);
}
return(ret);
@@ -2926,7 +2924,7 @@
SHRINK;
cur = CUR_CHAR(l);
- while (IS_CHAR_CH(cur)) {
+ while (cur != 0) {
if ((cur == '<') && (NXT(1) == '/')) {
/*
* One should break here, the specification is clear:
@@ -2957,8 +2955,14 @@
}
}
}
- COPY_BUF(l,buf,nbchar,cur);
+ if (IS_CHAR(cur)) {
+ COPY_BUF(l,buf,nbchar,cur);
+ } else {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Invalid char in CDATA 0x%X\n", cur);
+ }
if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
+ buf[nbchar] = 0;
if (ctxt->sax->cdataBlock!= NULL) {
/*
* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
@@ -2974,15 +2978,8 @@
cur = CUR_CHAR(l);
}
- if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
- htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
- "Invalid char in CDATA 0x%X\n", cur);
- if (ctxt->input->cur < ctxt->input->end) {
- NEXT;
- }
- }
-
if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
+ buf[nbchar] = 0;
if (ctxt->sax->cdataBlock!= NULL) {
/*
* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
@@ -3028,6 +3025,8 @@
COPY_BUF(l,buf,nbchar,cur);
}
if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
+ buf[nbchar] = 0;
+
/*
* Ok the segment is to be consumed as chars.
*/
@@ -3226,7 +3225,7 @@
}
SKIP_BLANKS;
cur = CUR_CHAR(l);
- while (IS_CHAR(cur) && (cur != '>')) {
+ while ((cur != 0) && (cur != '>')) {
if (len + 5 >= size) {
xmlChar *tmp;
@@ -3245,7 +3244,13 @@
GROW;
count = 0;
}
- COPY_BUF(l,buf,len,cur);
+ if (IS_CHAR(cur)) {
+ COPY_BUF(l,buf,len,cur);
+ } else {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Invalid char in processing instruction "
+ "0x%X\n", cur);
+ }
NEXTL(l);
cur = CUR_CHAR(l);
if (cur == 0) {
@@ -3315,15 +3320,15 @@
len = 0;
buf[len] = 0;
q = CUR_CHAR(ql);
- if (!IS_CHAR(q))
+ if (q == 0)
goto unfinished;
NEXTL(ql);
r = CUR_CHAR(rl);
- if (!IS_CHAR(r))
+ if (r == 0)
goto unfinished;
NEXTL(rl);
cur = CUR_CHAR(l);
- while (IS_CHAR(cur) &&
+ while ((cur != 0) &&
((cur != '>') ||
(r != '-') || (q != '-'))) {
if (len + 5 >= size) {
@@ -3339,7 +3344,12 @@
}
buf = tmp;
}
- COPY_BUF(ql,buf,len,q);
+ if (IS_CHAR(q)) {
+ COPY_BUF(ql,buf,len,q);
+ } else {
+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
+ "Invalid char in comment 0x%X\n", q);
+ }
q = r;
ql = rl;
r = cur;
@@ -3353,7 +3363,7 @@
}
}
buf[len] = 0;
- if (IS_CHAR(cur)) {
+ if (cur == '>') {
NEXT;
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
(!ctxt->disableSAX))
@@ -3394,13 +3404,16 @@
((NXT(2) == 'x') || NXT(2) == 'X')) {
SKIP(3);
while (CUR != ';') {
- if ((CUR >= '0') && (CUR <= '9'))
- val = val * 16 + (CUR - '0');
- else if ((CUR >= 'a') && (CUR <= 'f'))
- val = val * 16 + (CUR - 'a') + 10;
- else if ((CUR >= 'A') && (CUR <= 'F'))
- val = val * 16 + (CUR - 'A') + 10;
- else {
+ if ((CUR >= '0') && (CUR <= '9')) {
+ if (val < 0x110000)
+ val = val * 16 + (CUR - '0');
+ } else if ((CUR >= 'a') && (CUR <= 'f')) {
+ if (val < 0x110000)
+ val = val * 16 + (CUR - 'a') + 10;
+ } else if ((CUR >= 'A') && (CUR <= 'F')) {
+ if (val < 0x110000)
+ val = val * 16 + (CUR - 'A') + 10;
+ } else {
htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
"htmlParseCharRef: missing semicolon\n",
NULL, NULL);
@@ -3413,9 +3426,10 @@
} else if ((CUR == '&') && (NXT(1) == '#')) {
SKIP(2);
while (CUR != ';') {
- if ((CUR >= '0') && (CUR <= '9'))
- val = val * 10 + (CUR - '0');
- else {
+ if ((CUR >= '0') && (CUR <= '9')) {
+ if (val < 0x110000)
+ val = val * 10 + (CUR - '0');
+ } else {
htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
"htmlParseCharRef: missing semicolon\n",
NULL, NULL);
@@ -3434,6 +3448,9 @@
*/
if (IS_CHAR(val)) {
return(val);
+ } else if (val >= 0x110000) {
+ htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
+ "htmlParseCharRef: value too large\n", NULL, NULL);
} else {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"htmlParseCharRef: invalid xmlChar value %d\n",
@@ -3493,9 +3510,12 @@
if (CUR != '>') {
htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
"DOCTYPE improperly terminated\n", NULL, NULL);
- /* We shouldn't try to resynchronize ... */
+ /* Ignore bogus content */
+ while ((CUR != 0) && (CUR != '>'))
+ NEXT;
}
- NEXT;
+ if (CUR == '>')
+ NEXT;
/*
* Create or update the document accordingly to the DOCTYPE
@@ -3773,7 +3793,7 @@
/* Dump the bogus tag like browsers do */
- while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
+ while ((CUR != 0) && (CUR != '>') &&
(ctxt->instate != XML_PARSER_EOF))
NEXT;
return -1;
@@ -3829,7 +3849,7 @@
* (S Attribute)* S?
*/
SKIP_BLANKS;
- while ((IS_CHAR_CH(CUR)) &&
+ while ((CUR != 0) &&
(CUR != '>') &&
((CUR != '/') || (NXT(1) != '>'))) {
long cons = ctxt->nbChars;
@@ -3892,7 +3912,7 @@
xmlFree(attvalue);
/* Dump the bogus attribute string up to the next blank or
* the end of the tag. */
- while ((IS_CHAR_CH(CUR)) &&
+ while ((CUR != 0) &&
!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
((CUR != '/') || (NXT(1) != '>')))
NEXT;
@@ -3973,19 +3993,14 @@
* We should definitely be at the ending "S? '>'" part
*/
SKIP_BLANKS;
- if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
+ if (CUR != '>') {
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
"End tag : expected '>'\n", NULL, NULL);
- if (ctxt->recovery) {
- /*
- * We're not at the ending > !!
- * Error, unless in recover mode where we search forwards
- * until we find a >
- */
- while (CUR != '\0' && CUR != '>') NEXT;
- NEXT;
- }
- } else
+ /* Skip to next '>' */
+ while ((CUR != 0) && (CUR != '>'))
+ NEXT;
+ }
+ if (CUR == '>')
NEXT;
/*
@@ -4175,7 +4190,7 @@
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
/* Dump the bogus tag like browsers do */
- while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
+ while ((CUR != 0) && (CUR != '>'))
NEXT;
if (currentNode != NULL)
@@ -4390,7 +4405,7 @@
*/
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
- while (IS_CHAR_CH(CUR)) {
+ while (CUR != 0) {
oldptr = ctxt->input->cur;
htmlParseContent(ctxt);
if (oldptr==ctxt->input->cur) break;
@@ -4407,7 +4422,7 @@
node_info.node = ctxt->node;
xmlParserAddNodeInfo(ctxt, &node_info);
}
- if (!IS_CHAR_CH(CUR)) {
+ if (CUR == 0) {
htmlAutoCloseOnEnd(ctxt);
}
@@ -4428,7 +4443,7 @@
xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
htmlNodeInfoPop(ctxt);
}
- if (!IS_CHAR_CH(CUR)) {
+ if (CUR == 0) {
htmlAutoCloseOnEnd(ctxt);
}
}
@@ -4577,7 +4592,7 @@
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
/* Dump the bogus tag like browsers do */
- while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
+ while ((CUR != 0) && (CUR != '>'))
NEXT;
htmlParserFinishElementParsing(ctxt);
@@ -5113,7 +5128,7 @@
* @first: the first char to lookup
* @next: the next char to lookup or zero
* @third: the next char to lookup or zero
- * @comment: flag to force checking inside comments
+ * @ignoreattrval: skip over attribute values
*
* Try to find if a sequence (first, next, third) or just (first next) or
* (first) is available in the input stream.
@@ -5127,13 +5142,11 @@
*/
static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
- xmlChar next, xmlChar third, int iscomment,
- int ignoreattrval)
+ xmlChar next, xmlChar third, int ignoreattrval)
{
int base, len;
htmlParserInputPtr in;
const xmlChar *buf;
- int incomment = 0;
int invalue = 0;
char valdellim = 0x0;
@@ -5145,8 +5158,11 @@
if (base < 0)
return (-1);
- if (ctxt->checkIndex > base)
+ if (ctxt->checkIndex > base) {
base = ctxt->checkIndex;
+ /* Abuse hasPErefs member to restore current state. */
+ invalue = ctxt->hasPErefs & 1 ? 1 : 0;
+ }
if (in->buf == NULL) {
buf = in->base;
@@ -5162,14 +5178,6 @@
else if (next)
len--;
for (; base < len; base++) {
- if ((!incomment) && (base + 4 < len) && (!iscomment)) {
- if ((buf[base] == '<') && (buf[base + 1] == '!') &&
- (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
- incomment = 1;
- /* do not increment past <! - some people use <!--> */
- base += 2;
- }
- }
if (ignoreattrval) {
if (buf[base] == '"' || buf[base] == '\'') {
if (invalue) {
@@ -5186,16 +5194,6 @@
continue;
}
}
- if (incomment) {
- if (base + 3 > len)
- return (-1);
- if ((buf[base] == '-') && (buf[base + 1] == '-') &&
- (buf[base + 2] == '>')) {
- incomment = 0;
- base += 2;
- }
- continue;
- }
if (buf[base] == first) {
if (third != 0) {
if ((buf[base + 1] != next) || (buf[base + 2] != third))
@@ -5222,8 +5220,12 @@
return (base - (in->cur - in->base));
}
}
- if ((!incomment) && (!invalue))
- ctxt->checkIndex = base;
+ ctxt->checkIndex = base;
+ /* Abuse hasPErefs member to track current state. */
+ if (invalue)
+ ctxt->hasPErefs |= 1;
+ else
+ ctxt->hasPErefs &= ~1;
#ifdef DEBUG_PUSH
if (next == 0)
xmlGenericError(xmlGenericErrorContext,
@@ -5240,80 +5242,6 @@
}
/**
- * htmlParseLookupChars:
- * @ctxt: an HTML parser context
- * @stop: Array of chars, which stop the lookup.
- * @stopLen: Length of stop-Array
- *
- * Try to find if any char of the stop-Array is available in the input
- * stream.
- * This function has a side effect of (possibly) incrementing ctxt->checkIndex
- * to avoid rescanning sequences of bytes, it DOES change the state of the
- * parser, do not use liberally.
- *
- * Returns the index to the current parsing point if a stopChar
- * is available, -1 otherwise.
- */
-static int
-htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
- int stopLen)
-{
- int base, len;
- htmlParserInputPtr in;
- const xmlChar *buf;
- int incomment = 0;
- int i;
-
- in = ctxt->input;
- if (in == NULL)
- return (-1);
-
- base = in->cur - in->base;
- if (base < 0)
- return (-1);
-
- if (ctxt->checkIndex > base)
- base = ctxt->checkIndex;
-
- if (in->buf == NULL) {
- buf = in->base;
- len = in->length;
- } else {
- buf = xmlBufContent(in->buf->buffer);
- len = xmlBufUse(in->buf->buffer);
- }
-
- for (; base < len; base++) {
- if (!incomment && (base + 4 < len)) {
- if ((buf[base] == '<') && (buf[base + 1] == '!') &&
- (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
- incomment = 1;
- /* do not increment past <! - some people use <!--> */
- base += 2;
- }
- }
- if (incomment) {
- if (base + 3 > len)
- return (-1);
- if ((buf[base] == '-') && (buf[base + 1] == '-') &&
- (buf[base + 2] == '>')) {
- incomment = 0;
- base += 2;
- }
- continue;
- }
- for (i = 0; i < stopLen; ++i) {
- if (buf[base] == stop[i]) {
- ctxt->checkIndex = 0;
- return (base - (in->cur - in->base));
- }
- }
- }
- ctxt->checkIndex = base;
- return (-1);
-}
-
-/**
* htmlParseTryOrFinish:
* @ctxt: an HTML parser context
* @terminate: last chunk indicator
@@ -5326,7 +5254,7 @@
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int ret = 0;
htmlParserInputPtr in;
- int avail = 0;
+ ptrdiff_t avail = 0;
xmlChar cur, next;
htmlParserNodeInfo node_info;
@@ -5391,7 +5319,8 @@
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+ (in->cur - in->base);
if ((avail == 0) && (terminate)) {
htmlAutoCloseOnEnd(ctxt);
if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
@@ -5405,6 +5334,12 @@
}
if (avail < 1)
goto done;
+ /*
+ * This is done to make progress and avoid an infinite loop
+ * if a parsing attempt was aborted by hitting a NUL byte. After
+ * changing htmlCurrentChar, this probably isn't necessary anymore.
+ * We should consider removing this check.
+ */
cur = in->cur[0];
if (cur == 0) {
SKIP(1);
@@ -5427,7 +5362,8 @@
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+ (in->cur - in->base);
}
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData,
@@ -5444,7 +5380,7 @@
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5469,14 +5405,15 @@
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+ (in->cur - in->base);
/*
* no chars in buffer
*/
if (avail < 1)
goto done;
/*
- * not enouth chars in buffer
+ * not enough chars in buffer
*/
if (avail < 2) {
if (!terminate)
@@ -5490,7 +5427,7 @@
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5500,7 +5437,7 @@
ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5514,7 +5451,7 @@
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5530,7 +5467,7 @@
(avail < 9)) {
goto done;
} else {
- ctxt->instate = XML_PARSER_START_TAG;
+ ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"HPP: entering START_TAG\n");
@@ -5542,7 +5479,8 @@
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+ (in->cur - in->base);
if (avail < 2)
goto done;
cur = in->cur[0];
@@ -5550,7 +5488,7 @@
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5560,7 +5498,7 @@
ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5572,7 +5510,7 @@
(avail < 4)) {
goto done;
} else {
- ctxt->instate = XML_PARSER_START_TAG;
+ ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"HPP: entering START_TAG\n");
@@ -5583,7 +5521,8 @@
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
+ (in->cur - in->base);
if (avail < 1)
goto done;
cur = in->cur[0];
@@ -5597,7 +5536,7 @@
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5607,7 +5546,7 @@
ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5642,7 +5581,7 @@
if (avail < 1)
goto done;
/*
- * not enouth chars in buffer
+ * not enough chars in buffer
*/
if (avail < 2) {
if (!terminate)
@@ -5671,7 +5610,7 @@
break;
}
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done;
/* Capture start position */
@@ -5762,13 +5701,13 @@
break;
}
case XML_PARSER_CONTENT: {
+ xmlChar chr[2] = { 0, 0 };
long cons;
+
/*
* Handle preparsed entities and charRef
*/
if (ctxt->token != 0) {
- xmlChar chr[2] = { 0 , 0 } ;
-
chr[0] = (xmlChar) ctxt->token;
htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
@@ -5780,21 +5719,22 @@
cur = in->cur[0];
if ((cur != '<') && (cur != '&')) {
if (ctxt->sax != NULL) {
+ chr[0] = cur;
if (IS_BLANK_CH(cur)) {
if (ctxt->keepBlanks) {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(
- ctxt->userData, &in->cur[0], 1);
+ ctxt->userData, chr, 1);
} else {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(
- ctxt->userData, &in->cur[0], 1);
+ ctxt->userData, chr, 1);
}
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(
- ctxt->userData, &in->cur[0], 1);
+ ctxt->userData, chr, 1);
}
}
ctxt->token = 0;
@@ -5817,7 +5757,7 @@
int idx;
xmlChar val;
- idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
+ idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
if (idx < 0)
goto done;
val = in->cur[idx + 2];
@@ -5844,7 +5784,7 @@
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done;
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"Misplaced DOCTYPE declaration\n",
@@ -5854,7 +5794,7 @@
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) &&
(htmlParseLookupSequence(
- ctxt, '-', '-', '>', 1, 1) < 0))
+ ctxt, '-', '-', '>', 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5864,7 +5804,7 @@
ctxt->instate = XML_PARSER_CONTENT;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5883,24 +5823,35 @@
#endif
break;
} else if (cur == '<') {
- ctxt->instate = XML_PARSER_START_TAG;
- ctxt->checkIndex = 0;
+ if ((!terminate) && (next == 0))
+ goto done;
+ /*
+ * Only switch to START_TAG if the next character
+ * starts a valid name. Otherwise, htmlParseStartTag
+ * might return without consuming all characters
+ * up to the final '>'.
+ */
+ if ((IS_ASCII_LETTER(next)) ||
+ (next == '_') || (next == ':') || (next == '.')) {
+ ctxt->instate = XML_PARSER_START_TAG;
+ ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
- xmlGenericError(xmlGenericErrorContext,
- "HPP: entering START_TAG\n");
+ xmlGenericError(xmlGenericErrorContext,
+ "HPP: entering START_TAG\n");
#endif
+ } else {
+ htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
+ "htmlParseTryOrFinish: "
+ "invalid element name\n",
+ NULL, NULL);
+ htmlCheckParagraph(ctxt);
+ if ((ctxt->sax != NULL) &&
+ (ctxt->sax->characters != NULL))
+ ctxt->sax->characters(ctxt->userData,
+ in->cur, 1);
+ NEXT;
+ }
break;
- } else if (cur == '&') {
- if ((!terminate) &&
- (htmlParseLookupChars(ctxt,
- BAD_CAST "; >/", 4) < 0))
- goto done;
-#ifdef DEBUG_PUSH
- xmlGenericError(xmlGenericErrorContext,
- "HPP: Parsing Reference\n");
-#endif
- /* TODO: check generation of subtrees if noent !!! */
- htmlParseReference(ctxt);
} else {
/*
* check that the text sequence is complete
@@ -5909,14 +5860,21 @@
* data detection.
*/
if ((!terminate) &&
- (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
+ (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
goto done;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
"HPP: Parsing char data\n");
#endif
- htmlParseCharData(ctxt);
+ while ((cur != '<') && (cur != 0)) {
+ if (cur == '&') {
+ htmlParseReference(ctxt);
+ } else {
+ htmlParseCharData(ctxt);
+ }
+ cur = in->cur[0];
+ }
}
}
if (cons == ctxt->nbChars) {
@@ -5935,7 +5893,7 @@
if (avail < 2)
goto done;
if ((!terminate) &&
- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
htmlParseEndTag(ctxt);
if (ctxt->nameNr == 0) {
@@ -6117,12 +6075,12 @@
int res;
res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
+ xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
if (res < 0) {
ctxt->errNo = XML_PARSER_EOF;
ctxt->disableSAX = 1;
return (XML_PARSER_EOF);
}
- xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
#endif
@@ -6141,12 +6099,12 @@
size_t current = ctxt->input->cur - ctxt->input->base;
nbchars = xmlCharEncInput(in, terminate);
+ xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
if (nbchars < 0) {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"encoder error\n", NULL, NULL);
return(XML_ERR_INVALID_ENCODING);
}
- xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
}
}
}