- parser.c parserInternals.c encoding.c: Since Notepad on Win2k outputs a BOM in UTF-8, an erratum allowing it has been issued; that was the most reasonable solution. Add support for a leading UTF-8 BOM in entities. Daniel

commit: 87a764ed851c56eb6cf805c80ec96972224e6834 [log] [tgz]
author: Daniel Veillard <veillard@src.gnome.org> Wed Jun 20 17:41:10 2001 +0000
committer: Daniel Veillard <veillard@src.gnome.org> Wed Jun 20 17:41:10 2001 +0000
tree: 0a4deea0679dff5c345129b1807150f6e5d675af
parent: 10ea86cba4b8bc812223e526cf2bc171464e494d [diff]
diff --git a/ChangeLog b/ChangeLog
index 800d971..7799b53 100644
--- a/ChangeLog
+++ b/ChangeLog

@@ -1,3 +1,10 @@
+Wed Jun 20 19:37:25 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
+
+	* parser.c parserInternals.c encoding.c: Since Notepad on Win2k
+	  outputs a BOM in UTF-8, an erratum allowing it has been
+	  issued; that was the most reasonable solution. Add support
+	  for a leading UTF-8 BOM in entities.
+
 Wed Jun 20 15:38:59 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
 
 	* valid.c: fixed a bug found when post validating an entity ref

diff --git a/encoding.c b/encoding.c
index f86adf3..df760f7 100644
--- a/encoding.c
+++ b/encoding.c

@@ -1131,6 +1131,15 @@
 	    (in[2] == 0x78) && (in[3] == 0x6D))
 	    return(XML_CHAR_ENCODING_UTF8);
     }
+    if (len >= 3) {
+	/*
+	 * Errata on XML-1.0 June 20 2001
+	 * A leading UTF-8 encoded BOM (EF BB BF) is now allowed
+	 */
+	if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
+	    (in[2] == 0xBF))
+	    return(XML_CHAR_ENCODING_UTF8);
+    }
     if (len >= 2) {
 	if ((in[0] == 0xFE) && (in[1] == 0xFF))
 	    return(XML_CHAR_ENCODING_UTF16BE);

diff --git a/parser.c b/parser.c
index d7c7c56..f1e90ca 100644
--- a/parser.c
+++ b/parser.c

@@ -768,6 +768,9 @@
 	    } else {
 	        if ((entity->etype == XML_INTERNAL_PARAMETER_ENTITY) ||
 		    (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY)) {
+		    xmlChar start[4];
+		    xmlCharEncoding enc;
+
 		    /*
 		     * handle the extra spaces added before and after
 		     * c.f. http://www.w3.org/TR/REC-xml#as-PE
@@ -775,6 +778,22 @@
 		     */
 		    input = xmlNewEntityInputStream(ctxt, entity);
 		    xmlPushInput(ctxt, input);
+
+		    /*
+		     * Read the first 4 bytes of the entity input and try
+		     * to detect its charset; if one is recognized
+		     * (enc != XML_CHAR_ENCODING_NONE) switch to it.
+		     */
+		    GROW;
+		    start[0] = RAW;
+		    start[1] = NXT(1);
+		    start[2] = NXT(2);
+		    start[3] = NXT(3);
+		    enc = xmlDetectCharEncoding(start, 4);
+		    if (enc != XML_CHAR_ENCODING_NONE) {
+			xmlSwitchEncoding(ctxt, enc);
+		    }
+
 		    if ((entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) &&
 			(RAW == '<') && (NXT(1) == '?') &&
 			(NXT(2) == 'x') && (NXT(3) == 'm') &&
@@ -8585,6 +8604,7 @@
     xmlDtdPtr ret = NULL;
     xmlParserCtxtPtr ctxt;
     xmlParserInputPtr pinput = NULL;
+    xmlChar start[4];
 
     if (input == NULL)
 	return(NULL);
@@ -8634,6 +8654,23 @@
     ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0");
     ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none",
 	                               BAD_CAST "none", BAD_CAST "none");
+
+    if (enc == XML_CHAR_ENCODING_NONE) {
+	/*
+	 * Read the first 4 bytes and try to detect the charset;
+	 * if one is recognized (enc != XML_CHAR_ENCODING_NONE)
+	 * switch to it. NOTE(review): no GROW before reads - confirm.
+	 */
+	start[0] = RAW;
+	start[1] = NXT(1);
+	start[2] = NXT(2);
+	start[3] = NXT(3);
+	enc = xmlDetectCharEncoding(start, 4);
+	if (enc != XML_CHAR_ENCODING_NONE) {
+	    xmlSwitchEncoding(ctxt, enc);
+	}
+    }
+
     xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none");
 
     if (ctxt->myDoc != NULL) {
@@ -8785,6 +8822,8 @@
     xmlDocPtr newDoc;
     xmlSAXHandlerPtr oldsax = NULL;
     int ret = 0;
+    xmlChar start[4];
+    xmlCharEncoding enc;
 
     if (ctx->depth > 40) {
 	return(XML_ERR_ENTITY_LOOP);
@@ -8832,10 +8871,24 @@
 	newDoc->children->doc = ctx->myDoc;
     }
 
+    /*
+     * Read the first 4 bytes and try to detect the charset;
+     * if one is recognized (enc != XML_CHAR_ENCODING_NONE)
+     * switch the parser context to it.
+     */
+    GROW;
+    start[0] = RAW;
+    start[1] = NXT(1);
+    start[2] = NXT(2);
+    start[3] = NXT(3);
+    enc = xmlDetectCharEncoding(start, 4);
+    if (enc != XML_CHAR_ENCODING_NONE) {
+        xmlSwitchEncoding(ctxt, enc);
+    }
+
     /*
      * Parse a possible text declaration first
      */
-    GROW;
     if ((RAW == '<') && (NXT(1) == '?') &&
 	(NXT(2) == 'x') && (NXT(3) == 'm') &&
 	(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
@@ -8946,6 +8999,8 @@
     xmlDocPtr newDoc;
     xmlSAXHandlerPtr oldsax = NULL;
     int ret = 0;
+    xmlChar start[4];
+    xmlCharEncoding enc;
 
     if (depth > 40) {
 	return(XML_ERR_ENTITY_LOOP);
@@ -9015,10 +9070,24 @@
 	newDoc->children->doc = doc;
     }
 
+    /*
+     * Read the first 4 bytes and try to detect the charset;
+     * if one is recognized (enc != XML_CHAR_ENCODING_NONE)
+     * switch the parser context to it.
+     */
+    GROW;
+    start[0] = RAW;
+    start[1] = NXT(1);
+    start[2] = NXT(2);
+    start[3] = NXT(3);
+    enc = xmlDetectCharEncoding(start, 4);
+    if (enc != XML_CHAR_ENCODING_NONE) {
+        xmlSwitchEncoding(ctxt, enc);
+    }
+
     /*
      * Parse a possible text declaration first
      */
-    GROW;
     if ((RAW == '<') && (NXT(1) == '?') &&
 	(NXT(2) == 'x') && (NXT(3) == 'm') &&
 	(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {

diff --git a/parserInternals.c b/parserInternals.c
index 4039c99..90b4812 100644
--- a/parserInternals.c
+++ b/parserInternals.c

@@ -1569,6 +1569,17 @@
 	case XML_CHAR_ENCODING_UTF8:
 	    /* default encoding, no conversion should be needed */
 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
+
+	    /*
+	     * Errata on XML-1.0 June 20 2001
+	     * Skip the 3-byte UTF-8 Byte Order Mark (EF BB BF)
+	     * when found at the current input position
+	     */
+	    if ((ctxt->input->cur[0] == 0xEF) &&
+		(ctxt->input->cur[1] == 0xBB) &&
+		(ctxt->input->cur[2] == 0xBF)) {
+		ctxt->input->cur += 3;
+	    }
 	    return(0);
 	default:
 	    break;
@@ -1739,6 +1750,18 @@
 		        (ctxt->input->cur[1] == 0xFF)) {
 			ctxt->input->cur += 2;
 		    }
+		    /*
+		     * Errata on XML-1.0 June 20 2001
+		     * When the handler is UTF-8, skip the 3-byte Byte
+		     * Order Mark (EF BB BF) at the current position
+		     */
+		    if ((handler->name != NULL) &&
+			(!strcmp(handler->name, "UTF-8")) &&
+			(ctxt->input->cur[0] == 0xEF) &&
+			(ctxt->input->cur[1] == 0xBB) &&
+			(ctxt->input->cur[2] == 0xBF)) {
+			ctxt->input->cur += 3;
+		    }
 
 		    /*
 		     * Shring the current input buffer.
commit	87a764ed851c56eb6cf805c80ec96972224e6834	[log] [tgz]
author	Daniel Veillard <veillard@src.gnome.org>	Wed Jun 20 17:41:10 2001 +0000
committer	Daniel Veillard <veillard@src.gnome.org>	Wed Jun 20 17:41:10 2001 +0000
tree	0a4deea0679dff5c345129b1807150f6e5d675af
parent	10ea86cba4b8bc812223e526cf2bc171464e494d [diff]