- parser.c parserInternals.c encoding.c: Since Notepad on Win2k
outputs a BOM in UTF8, an erratum has been issued to avoid the
problem; that was the most reasonable solution... Add support
for a leading UTF8 BOM in entities.
Daniel
diff --git a/parserInternals.c b/parserInternals.c
index 4039c99..90b4812 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1569,6 +1569,17 @@
case XML_CHAR_ENCODING_UTF8:
/* default encoding, no conversion should be needed */
ctxt->charset = XML_CHAR_ENCODING_UTF8;
+
+ /*
+ * Errata on XML-1.0 June 20 2001
+ * Specific handling of the Byte Order Mark for
+ * UTF-8
+ */
+ if ((ctxt->input->cur[0] == 0xEF) &&
+ (ctxt->input->cur[1] == 0xBB) &&
+ (ctxt->input->cur[2] == 0xBF)) {
+ ctxt->input->cur += 3;
+ }
return(0);
default:
break;
@@ -1739,6 +1750,18 @@
(ctxt->input->cur[1] == 0xFF)) {
ctxt->input->cur += 2;
}
+ /*
+ * Errata on XML-1.0 June 20 2001
+ * Specific handling of the Byte Order Mark for
+ * UTF-8: skip a leading EF BB BF (all three bytes)
+ */
+ if ((handler->name != NULL) &&
+ (!strcmp(handler->name, "UTF-8")) &&
+ (ctxt->input->cur[0] == 0xEF) &&
+ (ctxt->input->cur[1] == 0xBB) &&
+ (ctxt->input->cur[2] == 0xBF)) {
+ ctxt->input->cur += 3;
+ }
/*
* Shring the current input buffer.