- parser.c parserInternals.c encoding.c: Since Notepad on Win2k
outputs a BOM in UTF-8, an erratum has been issued to avoid the
problem; that was the most reasonable solution... Add support
for a leading UTF-8 BOM in entities.
Daniel
diff --git a/ChangeLog b/ChangeLog
index 800d971..7799b53 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+Wed Jun 20 19:37:25 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
+
+ * parser.c parserInternals.c encoding.c: Since Notepad on Win2k
+ outputs a BOM in UTF-8, an erratum has been issued to avoid the
+ problem; that was the most reasonable solution... Add support
+ for a leading UTF-8 BOM in entities.
+
Wed Jun 20 15:38:59 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
* valid.c: fixed a bug found when post validating an entity ref
diff --git a/encoding.c b/encoding.c
index f86adf3..df760f7 100644
--- a/encoding.c
+++ b/encoding.c
@@ -1131,6 +1131,15 @@
(in[2] == 0x78) && (in[3] == 0x6D))
return(XML_CHAR_ENCODING_UTF8);
}
+ if (len >= 3) {
+ /*
+ * Errata on XML-1.0 June 20 2001
+ * We now allow an UTF8 encoded BOM
+ */
+ if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
+ (in[2] == 0xBF))
+ return(XML_CHAR_ENCODING_UTF8);
+ }
if (len >= 2) {
if ((in[0] == 0xFE) && (in[1] == 0xFF))
return(XML_CHAR_ENCODING_UTF16BE);
diff --git a/parser.c b/parser.c
index d7c7c56..f1e90ca 100644
--- a/parser.c
+++ b/parser.c
@@ -768,6 +768,9 @@
} else {
if ((entity->etype == XML_INTERNAL_PARAMETER_ENTITY) ||
(entity->etype == XML_EXTERNAL_PARAMETER_ENTITY)) {
+ xmlChar start[4];
+ xmlCharEncoding enc;
+
/*
* handle the extra spaces added before and after
* c.f. http://www.w3.org/TR/REC-xml#as-PE
@@ -775,6 +778,22 @@
*/
input = xmlNewEntityInputStream(ctxt, entity);
xmlPushInput(ctxt, input);
+
+ /*
+ * Read the first 4 bytes of the pushed entity input and
+ * try to detect its encoding; when one is recognized
+ * (enc != XML_CHAR_ENCODING_NONE) switch the parser to it.
+ */
+ GROW;
+ start[0] = RAW;
+ start[1] = NXT(1);
+ start[2] = NXT(2);
+ start[3] = NXT(3);
+ enc = xmlDetectCharEncoding(start, 4);
+ if (enc != XML_CHAR_ENCODING_NONE) {
+ xmlSwitchEncoding(ctxt, enc);
+ }
+
if ((entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) &&
(RAW == '<') && (NXT(1) == '?') &&
(NXT(2) == 'x') && (NXT(3) == 'm') &&
@@ -8585,6 +8604,7 @@
xmlDtdPtr ret = NULL;
xmlParserCtxtPtr ctxt;
xmlParserInputPtr pinput = NULL;
+ xmlChar start[4];
if (input == NULL)
return(NULL);
@@ -8634,6 +8654,23 @@
ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0");
ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none",
BAD_CAST "none", BAD_CAST "none");
+
+ if (enc == XML_CHAR_ENCODING_NONE) {
+ /*
+ * Get the 4 first bytes and decode the charset
+ * if enc != XML_CHAR_ENCODING_NONE
+ * plug some encoding conversion routines.
+ */
+ start[0] = RAW;
+ start[1] = NXT(1);
+ start[2] = NXT(2);
+ start[3] = NXT(3);
+ enc = xmlDetectCharEncoding(start, 4);
+ if (enc != XML_CHAR_ENCODING_NONE) {
+ xmlSwitchEncoding(ctxt, enc);
+ }
+ }
+
xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none");
if (ctxt->myDoc != NULL) {
@@ -8785,6 +8822,8 @@
xmlDocPtr newDoc;
xmlSAXHandlerPtr oldsax = NULL;
int ret = 0;
+ xmlChar start[4];
+ xmlCharEncoding enc;
if (ctx->depth > 40) {
return(XML_ERR_ENTITY_LOOP);
@@ -8832,10 +8871,24 @@
newDoc->children->doc = ctx->myDoc;
}
+ /*
+ * Read the first 4 bytes of the input and try to detect
+ * its encoding; when one is recognized
+ * (enc != XML_CHAR_ENCODING_NONE) switch the parser to it.
+ */
+ GROW;
+ start[0] = RAW;
+ start[1] = NXT(1);
+ start[2] = NXT(2);
+ start[3] = NXT(3);
+ enc = xmlDetectCharEncoding(start, 4);
+ if (enc != XML_CHAR_ENCODING_NONE) {
+ xmlSwitchEncoding(ctxt, enc);
+ }
+
/*
* Parse a possible text declaration first
*/
- GROW;
if ((RAW == '<') && (NXT(1) == '?') &&
(NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
@@ -8946,6 +8999,8 @@
xmlDocPtr newDoc;
xmlSAXHandlerPtr oldsax = NULL;
int ret = 0;
+ xmlChar start[4];
+ xmlCharEncoding enc;
if (depth > 40) {
return(XML_ERR_ENTITY_LOOP);
@@ -9015,10 +9070,24 @@
newDoc->children->doc = doc;
}
+ /*
+ * Get the 4 first bytes and decode the charset
+ * if enc != XML_CHAR_ENCODING_NONE
+ * plug some encoding conversion routines.
+ */
+ GROW;
+ start[0] = RAW;
+ start[1] = NXT(1);
+ start[2] = NXT(2);
+ start[3] = NXT(3);
+ enc = xmlDetectCharEncoding(start, 4);
+ if (enc != XML_CHAR_ENCODING_NONE) {
+ xmlSwitchEncoding(ctxt, enc);
+ }
+
/*
* Parse a possible text declaration first
*/
- GROW;
if ((RAW == '<') && (NXT(1) == '?') &&
(NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
diff --git a/parserInternals.c b/parserInternals.c
index 4039c99..90b4812 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1569,6 +1569,17 @@
case XML_CHAR_ENCODING_UTF8:
/* default encoding, no conversion should be needed */
ctxt->charset = XML_CHAR_ENCODING_UTF8;
+
+ /*
+ * Errata on XML-1.0 June 20 2001
+ * Specific handling of the Byte Order Mark for
+ * UTF-8
+ */
+ if ((ctxt->input->cur[0] == 0xEF) &&
+ (ctxt->input->cur[1] == 0xBB) &&
+ (ctxt->input->cur[2] == 0xBF)) {
+ ctxt->input->cur += 3;
+ }
return(0);
default:
break;
@@ -1739,6 +1750,18 @@
(ctxt->input->cur[1] == 0xFF)) {
ctxt->input->cur += 2;
}
+ /*
+ * Errata on XML-1.0 June 20 2001
+ * Specific handling of the Byte Order Mark for
+ * UTF-8: skip the 3 bytes EF BB BF when present
+ */
+ if ((handler->name != NULL) &&
+ (!strcmp(handler->name, "UTF-8")) &&
+ (ctxt->input->cur[0] == 0xEF) &&
+ (ctxt->input->cur[1] == 0xBB) &&
+ (ctxt->input->cur[2] == 0xBF)) {
+ ctxt->input->cur += 3;
+ }
/*
* Shring the current input buffer.