revamped the encoding support, added iconv support, so now libxml if
* encoding.[ch], xmlIO.[ch], parser.c, configure.in : revamped
the encoding support, added iconv support, so now libxml, if
compiled with iconv, automatically supports Japanese encodings
among others. Work based on an initial patch from Yuan-Chen Cheng.
I may have broken binary compat in the encoding handler
registration scheme, but that was so utterly broken I don't
expect anybody to have used this feature until now.
* parserInternals.h: fixup on the CHAR range macro
* xml-error.h, parser.c: catch URL/URI errors using the uri.c
code.
* tree.[ch]: added xmlBufferGrow(), was needed for iconv
* uri.c: added xmlParseURI(); I can't believe I forgot to
implement this one in 2.0!
* SAX.c: moved the doc->encoding update into the endDocument() call.
* TODO: updated.
Iconv rules :-)
Daniel
diff --git a/ChangeLog b/ChangeLog
index 5ccb9e5..5361b29 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+Wed May 3 14:21:25 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
+
+ * encoding.[ch], xmlIO.[ch], parser.c, configure.in : revamped
+ the encoding support, added iconv support, so now libxml if
+ compiled with iconv automatically support japanese encodings
+ among others. Work based on initial patch from Yuan-Chen Cheng
+ I may have broken binary compat in the encoding handler
+ registration scheme, but that was so utterly broken I don't
+ expect anybody to have used this feature until now.
+ * parserInternals.h: fixup on the CHAR range macro
+ * xml-error.h, parser.c: catch URL/URI errors using the uri.c
+ code.
+ * tree.[ch]: added xmlBufferGrow(), was needed for iconv
+ * uri.c: added xmlParseURI() I can't believe I forgot to
+ implement this one in 2.0 !!!
+ * SAX.c: moved doc->encoding update in the endDocument() call.
+ * TODO: updated.
+
Mon Apr 24 13:30:13 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
* tree.h: removed extraneous xmlRemoveProp definition
diff --git a/SAX.c b/SAX.c
index 5293df7..dace305 100644
--- a/SAX.c
+++ b/SAX.c
@@ -595,6 +595,15 @@
if (ctxt->validate && ctxt->wellFormed &&
ctxt->myDoc && ctxt->myDoc->intSubset)
ctxt->valid &= xmlValidateDocumentFinal(&ctxt->vctxt, ctxt->myDoc);
+
+ /*
+ * Grab the encoding if it was added on-the-fly
+ */
+ if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) &&
+ (ctxt->myDoc->encoding == NULL)) {
+ ctxt->myDoc->encoding = ctxt->encoding;
+ ctxt->encoding = NULL;
+ }
}
/**
diff --git a/TODO b/TODO
index 51ea18b..2b4ae5e 100644
--- a/TODO
+++ b/TODO
@@ -6,6 +6,8 @@
TODO:
=====
+- xmlSwitchToEncoding() need a rewrite for correct handling of conversion
+ error code conditions.
- DOM needs
xmlAttrPtr xmlNewDocProp(xmlDocPtr doc, const xmlChar *name, const xmlChar *value)
int xmlPruneProp(xmlNodePtr node, xmlAtttrPtr attr);
@@ -14,7 +16,6 @@
- add support for the trick from Henry conf/sun/valid/empty.xml
- Correct standalone checking/emitting (hard)
2.9 Standalone Document Declaration
-- URI checkings (no fragments) rfc2396.txt
- Better checking of external parsed entities TAG 1234
- Find way of representing PERefs in the Dtd so that %entity; can
be saved back.
@@ -22,6 +23,7 @@
http://www.w3.org/XML/xml-19980210-errata ... bummmer
- Handle undefined namespaces in entity contents better ... at least
issue a warning
+- Issue warning when using non-absolute namespaces URI.
- General checking of DTD validation in presence of namespaces ... hairy
- fix --disable-corba configure switch handling, and use XML_WITHOUT_CORBA
not WITHOUT_CORBA flag
@@ -30,7 +32,7 @@
=====
- Get OASIS testsuite to a more friendly result, check all the results
- once stable.
+ once stable. Current state at:
http://xmlsoft.org/conf/result.html
- Optimization of tag strings allocation ?
@@ -55,11 +57,13 @@
- Add Xlink recognition/API
=> started adding an xlink.[ch] with a unified API for XML and HTML.
+ it's crap :-(
- Implement XSLT
=> seems that someone volunteered ?!?
- Implement XSchemas
+ => Really need to be done <grin/>
- O2K parsing;
=> this is a somewhat ugly mix of HTML and XML, adding a specific
@@ -88,6 +92,7 @@
Done:
=====
+- URI checkings (no fragments) rfc2396.txt
- Added a clean mechanism for overload or added input methods:
xmlRegisterInputCallbacks()
- dynamically adapt the alloc entry point to use g_alloc()/g_free()
diff --git a/configure.in b/configure.in
index baea933..3ef84fb 100644
--- a/configure.in
+++ b/configure.in
@@ -4,7 +4,7 @@
AM_CONFIG_HEADER(config.h)
LIBXML_MAJOR_VERSION=2
-LIBXML_MINOR_VERSION=0
+LIBXML_MINOR_VERSION=1
LIBXML_MICRO_VERSION=0
LIBXML_VERSION=$LIBXML_MAJOR_VERSION.$LIBXML_MINOR_VERSION.$LIBXML_MICRO_VERSION
LIBXML_VERSION_INFO=`expr $LIBXML_MAJOR_VERSION + $LIBXML_MINOR_VERSION`:$LIBXML_MICRO_VERSION:$LIBXML_MINOR_VERSION
@@ -203,6 +203,20 @@
AC_SUBST(WITH_XPATH)
AC_SUBST(XPATH_OBJ)
+AC_ARG_WITH(iconv, [ --with-iconv Add the ICONV support (on)])
+if test "$with_iconv" = "no" ; then
+ echo Disabling ICONV support
+ WITH_ICONV=0
+else
+ if test "$have_iconv" != "" ; then
+ echo Iconv support not found
+ WITH_ICONV=0
+ else
+ WITH_ICONV=1
+ fi
+fi
+AC_SUBST(WITH_ICONV)
+
AC_ARG_WITH(debug, [ --with-debug Add the debugging module (on)])
if test "$with_debug" = "no" ; then
echo Disabling DEBUG support
diff --git a/encoding.c b/encoding.c
index 4200929..1a4c157 100644
--- a/encoding.c
+++ b/encoding.c
@@ -34,12 +34,26 @@
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
+#include <libxml/xmlversion.h>
+#ifdef LIBXML_ICONV_ENABLED
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#endif
#include <libxml/encoding.h>
#include <libxml/xmlmemory.h>
xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
+#ifdef LIBXML_ICONV_ENABLED
+#if 0
+#define DEBUG_ENCODING /* Define this to get encoding traces */
+#endif
+#endif
+
+static int xmlLittleEndian = 1;
+
/*
* From rfc2044: encoding of the Unicode values on UTF-8:
*
@@ -104,30 +118,38 @@
*
* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
* block of chars out.
- * Returns the number of byte written, or -1 by lack of space.
+ * Returns 0 if success, or -1 otherwise
+ * The value of @inlen after return is the number of octets consumed
+ * as the return value is positive, else unpredictiable.
+ * The value of @outlen after return is the number of ocetes consumed.
*/
int
-isolat1ToUTF8(unsigned char* out, int outlen,
+isolat1ToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
- unsigned char* outstart= out;
- unsigned char* outend= out+outlen;
- const unsigned char* inend= in+*inlen;
+ unsigned char* outstart = out;
+ const unsigned char* processed = in;
+ unsigned char* outend = out + *outlen;
+ const unsigned char* inend = in + *inlen;
unsigned char c;
while (in < inend) {
c= *in++;
if (c < 0x80) {
- if (out >= outend) return(-1);
+ if (out >= outend)
+ break;
*out++ = c;
}
else {
- if (out >= outend) return(-1);
+ if (out + 1 >= outend) break;
*out++ = 0xC0 | (c >> 6);
- if (out >= outend) return(-1);
*out++ = 0x80 | (0x3F & c);
}
+ processed = in;
}
- return(out-outstart);
+ *outlen = out - outstart;
+ *inlen = processed - in;
+
+ return(0);
}
/**
@@ -141,18 +163,18 @@
* block of chars out.
* TODO: UTF8Toisolat1 need a fallback mechanism ...
*
- * Returns the number of byte written, or -1 by lack of space, or -2
- * if the transcoding fails (for *in is not valid utf8 string or
- * the result of transformation can't fit into the encoding we want)
+ * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
* The value of @inlen after return is the number of octets consumed
* as the return value is positive, else unpredictiable.
+ * The value of @outlen after return is the number of ocetes consumed.
*/
int
-UTF8Toisolat1(unsigned char* out, int outlen,
+UTF8Toisolat1(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
- unsigned char* outstart= out;
- unsigned char* outend= out+outlen;
- const unsigned char* inend= in+*inlen;
+ unsigned char* outstart = out;
+ const unsigned char* processed = in;
+ unsigned char* outend = out + *outlen;
+ const unsigned char* inend = in + *inlen;
unsigned char c;
while (in < inend) {
@@ -162,18 +184,22 @@
*out++= c;
}
else if (in == inend) {
- *inlen -= 1;
break;
}
else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
/* a two byte utf-8 and can be encoding as isolate1 */
*out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
}
- else
+ else {
+ *outlen = out - outstart;
+ *inlen = processed - in;
return(-2);
- /* TODO : some should be represent as "&#x____;" */
+ }
+ processed = in;
}
- return(out-outstart);
+ *outlen = out - outstart;
+ *inlen = processed - in;
+ return(0);
}
/**
@@ -194,11 +220,12 @@
* as the return value is positive, else unpredictiable.
*/
int
-UTF16LEToUTF8(unsigned char* out, int outlen,
+UTF16LEToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb)
{
- unsigned char* outstart= out;
- unsigned char* outend= out+outlen;
+ unsigned char* outstart = out;
+ const unsigned char* processed = inb;
+ unsigned char* outend = out + *outlen;
unsigned short* in = (unsigned short*) inb;
unsigned short* inend;
unsigned int c, d, inlen;
@@ -208,42 +235,44 @@
if ((*inlenb % 2) == 1)
(*inlenb)--;
inlen = *inlenb / 2;
- inend= in + inlen;
+ inend = in + inlen;
while (in < inend) {
-#ifdef BIG_ENDIAN
- tmp = (unsigned char *) in;
- c = *tmp++;
- c = c | (((unsigned int)*tmp) << 8);
- in++;
-#else /* BIG_ENDIAN */
- c= *in++;
-#endif /* BIG_ENDIAN */
+ if (xmlLittleEndian) {
+ c= *in++;
+ } else {
+ tmp = (unsigned char *) in;
+ c = *tmp++;
+ c = c | (((unsigned int)*tmp) << 8);
+ in++;
+ }
if ((c & 0xFC00) == 0xD800) { /* surrogates */
if (in >= inend) { /* (in > inend) shouldn't happens */
- (*inlenb) -= 2;
break;
}
-#ifdef BIG_ENDIAN
- tmp = (unsigned char *) in;
- d = *tmp++;
- d = d | (((unsigned int)*tmp) << 8);
- in++;
-#else /* BIG_ENDIAN */
- d = *in++;
-#endif /* BIG_ENDIAN */
+ if (xmlLittleEndian) {
+ d = *in++;
+ } else {
+ tmp = (unsigned char *) in;
+ d = *tmp++;
+ d = d | (((unsigned int)*tmp) << 8);
+ in++;
+ }
if ((d & 0xFC00) == 0xDC00) {
c &= 0x03FF;
c <<= 10;
c |= d & 0x03FF;
c += 0x10000;
}
- else
+ else {
+ *outlen = out - outstart;
+ *inlenb = processed - inb;
return(-2);
+ }
}
/* assertion: c is a single UTF-4 value */
if (out >= outend)
- return(-1);
+ break;
if (c < 0x80) { *out++= c; bits= -6; }
else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; }
@@ -251,11 +280,14 @@
for ( ; bits >= 0; bits-= 6) {
if (out >= outend)
- return(-1);
+ break;
*out++= ((c >> bits) & 0x3F) | 0x80;
}
+ processed = (const unsigned char*) in;
}
- return(out-outstart);
+ *outlen = out - outstart;
+ *inlenb = processed - inb;
+ return(0);
}
/**
@@ -273,40 +305,44 @@
* if the transcoding failed.
*/
int
-UTF8ToUTF16LE(unsigned char* outb, int outlen,
+UTF8ToUTF16LE(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen)
{
unsigned short* out = (unsigned short*) outb;
+ const unsigned char* processed = in;
unsigned short* outstart= out;
unsigned short* outend;
const unsigned char* inend= in+*inlen;
unsigned int c, d, trailing;
-#ifdef BIG_ENDIAN
unsigned char *tmp;
unsigned short tmp1, tmp2;
-#endif /* BIG_ENDIAN */
- outlen /= 2; /* convert in short length */
- outend = out + outlen;
+ outend = out + (*outlen / 2);
while (in < inend) {
d= *in++;
if (d < 0x80) { c= d; trailing= 0; }
- else if (d < 0xC0)
- return(-2); /* trailing byte in leading position */
- else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
+ else if (d < 0xC0) {
+ /* trailing byte in leading position */
+ *outlen = out - outstart;
+ *inlen = processed - in;
+ return(-2);
+ } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
- else
- return(-2); /* no chance for this in UTF-16 */
+ else {
+ /* no chance for this in UTF-16 */
+ *outlen = out - outstart;
+ *inlen = processed - in;
+ return(-2);
+ }
if (inend - in < trailing) {
- *inlen -= (inend - in);
break;
}
for ( ; trailing; trailing--) {
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
- return(-1);
+ break;
c <<= 6;
c |= d & 0x3F;
}
@@ -314,41 +350,44 @@
/* assertion: c is a single UTF-4 value */
if (c < 0x10000) {
if (out >= outend)
- return(-1);
-#ifdef BIG_ENDIAN
- tmp = (unsigned char *) out;
- *tmp = c ;
- *(tmp + 1) = c >> 8 ;
- out++;
-#else /* BIG_ENDIAN */
- *out++ = c;
-#endif /* BIG_ENDIAN */
+ break;
+ if (xmlLittleEndian) {
+ *out++ = c;
+ } else {
+ tmp = (unsigned char *) out;
+ *tmp = c ;
+ *(tmp + 1) = c >> 8 ;
+ out++;
+ }
}
else if (c < 0x110000) {
if (out+1 >= outend)
- return(-1);
+ break;
c -= 0x10000;
-#ifdef BIG_ENDIAN
- tmp1 = 0xD800 | (c >> 10);
- tmp = (unsigned char *) out;
- *tmp = tmp1;
- *(tmp + 1) = tmp1 >> 8;
- out++;
+ if (xmlLittleEndian) {
+ *out++ = 0xD800 | (c >> 10);
+ *out++ = 0xDC00 | (c & 0x03FF);
+ } else {
+ tmp1 = 0xD800 | (c >> 10);
+ tmp = (unsigned char *) out;
+ *tmp = tmp1;
+ *(tmp + 1) = tmp1 >> 8;
+ out++;
- tmp2 = 0xDC00 | (c & 0x03FF);
- tmp = (unsigned char *) out;
- *tmp = tmp2;
- *(tmp + 1) = tmp2 >> 8;
- out++;
-#else /* BIG_ENDIAN */
- *out++ = 0xD800 | (c >> 10);
- *out++ = 0xDC00 | (c & 0x03FF);
-#endif /* BIG_ENDIAN */
+ tmp2 = 0xDC00 | (c & 0x03FF);
+ tmp = (unsigned char *) out;
+ *tmp = tmp2;
+ *(tmp + 1) = tmp2 >> 8;
+ out++;
+ }
}
else
- return(-1);
+ break;
+ processed = in;
}
- return(out-outstart);
+ *outlen = out - outstart;
+ *inlen = processed - in;
+ return(0);
}
/**
@@ -369,18 +408,16 @@
* as the return value is positive, else unpredictiable.
*/
int
-UTF16BEToUTF8(unsigned char* out, int outlen,
+UTF16BEToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb)
{
- unsigned char* outstart= out;
- unsigned char* outend= out+outlen;
+ unsigned char* outstart = out;
+ const unsigned char* processed = inb;
+ unsigned char* outend = out + *outlen;
unsigned short* in = (unsigned short*) inb;
unsigned short* inend;
unsigned int c, d, inlen;
-#ifdef BIG_ENDIAN
-#else /* BIG_ENDIAN */
unsigned char *tmp;
-#endif /* BIG_ENDIAN */
int bits;
if ((*inlenb % 2) == 1)
@@ -388,43 +425,46 @@
inlen = *inlenb / 2;
inend= in + inlen;
while (in < inend) {
-#ifdef BIG_ENDIAN
- c= *in++;
-#else
- tmp = (unsigned char *) in;
- c = *tmp++;
- c = c << 8;
- c = c | (unsigned int) *tmp;
- in++;
-#endif
+ if (xmlLittleEndian) {
+ tmp = (unsigned char *) in;
+ c = *tmp++;
+ c = c << 8;
+ c = c | (unsigned int) *tmp;
+ in++;
+ } else {
+ c= *in++;
+ }
if ((c & 0xFC00) == 0xD800) { /* surrogates */
if (in >= inend) { /* (in > inend) shouldn't happens */
- (*inlenb) -= 2;
- break;
+ *outlen = out - outstart;
+ *inlenb = processed - inb;
+ return(-2);
}
-
-#ifdef BIG_ENDIAN
- d= *in++;
-#else
- tmp = (unsigned char *) in;
- d = *tmp++;
- d = d << 8;
- d = d | (unsigned int) *tmp;
- in++;
-#endif
+ if (xmlLittleEndian) {
+ tmp = (unsigned char *) in;
+ d = *tmp++;
+ d = d << 8;
+ d = d | (unsigned int) *tmp;
+ in++;
+ } else {
+ d= *in++;
+ }
if ((d & 0xFC00) == 0xDC00) {
c &= 0x03FF;
c <<= 10;
c |= d & 0x03FF;
c += 0x10000;
}
- else
+ else {
+ *outlen = out - outstart;
+ *inlenb = processed - inb;
return(-2);
+ }
}
/* assertion: c is a single UTF-4 value */
if (out >= outend)
- return(-1);
+ break;
if (c < 0x80) { *out++= c; bits= -6; }
else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; }
@@ -432,11 +472,14 @@
for ( ; bits >= 0; bits-= 6) {
if (out >= outend)
- return(-1);
+ break;
*out++= ((c >> bits) & 0x3F) | 0x80;
}
+ processed = (const unsigned char*) in;
}
- return(out-outstart);
+ *outlen = out - outstart;
+ *inlenb = processed - inb;
+ return(0);
}
/**
@@ -454,79 +497,86 @@
* if the transcoding failed.
*/
int
-UTF8ToUTF16BE(unsigned char* outb, int outlen,
+UTF8ToUTF16BE(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen)
{
unsigned short* out = (unsigned short*) outb;
+ const unsigned char* processed = in;
unsigned short* outstart= out;
unsigned short* outend;
const unsigned char* inend= in+*inlen;
unsigned int c, d, trailing;
-#ifdef BIG_ENDIAN
-#else
unsigned char *tmp;
unsigned short tmp1, tmp2;
-#endif /* BIG_ENDIAN */
- outlen /= 2; /* convert in short length */
- outend = out + outlen;
+ outend = out + (*outlen / 2);
while (in < inend) {
d= *in++;
if (d < 0x80) { c= d; trailing= 0; }
- else if (d < 0xC0)
- return(-2); /* trailing byte in leading position */
- else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
+ else if (d < 0xC0) {
+ /* trailing byte in leading position */
+ *outlen = out - outstart;
+ *inlen = processed - in;
+ return(-2);
+ } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
- else
- return(-2); /* no chance for this in UTF-16 */
+ else {
+ /* no chance for this in UTF-16 */
+ *outlen = out - outstart;
+ *inlen = processed - in;
+ return(-2);
+ }
if (inend - in < trailing) {
- *inlen -= (inend - in);
break;
}
for ( ; trailing; trailing--) {
- if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) return(-1);
+ if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) break;
c <<= 6;
c |= d & 0x3F;
}
/* assertion: c is a single UTF-4 value */
if (c < 0x10000) {
- if (out >= outend) return(-1);
-#ifdef BIG_ENDIAN
- *out++ = c;
-#else
- tmp = (unsigned char *) out;
- *tmp = c >> 8;
- *(tmp + 1) = c;
- out++;
-#endif /* BIG_ENDIAN */
+ if (out >= outend) break;
+ if (xmlLittleEndian) {
+ tmp = (unsigned char *) out;
+ *tmp = c >> 8;
+ *(tmp + 1) = c;
+ out++;
+ } else {
+ *out++ = c;
+ }
}
else if (c < 0x110000) {
- if (out+1 >= outend) return(-1);
+ if (out+1 >= outend) break;
c -= 0x10000;
-#ifdef BIG_ENDIAN
- *out++ = 0xD800 | (c >> 10);
- *out++ = 0xDC00 | (c & 0x03FF);
-#else
- tmp1 = 0xD800 | (c >> 10);
- tmp = (unsigned char *) out;
- *tmp = tmp1 >> 8;
- *(tmp + 1) = tmp1;
- out++;
+ if (xmlLittleEndian) {
+ tmp1 = 0xD800 | (c >> 10);
+ tmp = (unsigned char *) out;
+ *tmp = tmp1 >> 8;
+ *(tmp + 1) = tmp1;
+ out++;
- tmp2 = 0xDC00 | (c & 0x03FF);
- tmp = (unsigned char *) out;
- *tmp = tmp2 >> 8;
- *(tmp + 1) = tmp2;
- out++;
-#endif
+ tmp2 = 0xDC00 | (c & 0x03FF);
+ tmp = (unsigned char *) out;
+ *tmp = tmp2 >> 8;
+ *(tmp + 1) = tmp2;
+ out++;
+ } else {
+ *out++ = 0xD800 | (c >> 10);
+ *out++ = 0xDC00 | (c & 0x03FF);
+ }
}
- else return(-1);
+ else
+ break;
+ processed = in;
}
- return(out-outstart);
+ *outlen = out - outstart;
+ *inlen = processed - in;
+ return(0);
}
/**
@@ -636,8 +686,12 @@
if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
- if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
+ if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
+
+#ifdef DEBUG_ENCODING
+ fprintf(stderr, "Unknown encoding %s\n", name);
+#endif
return(XML_CHAR_ENCODING_ERROR);
}
@@ -712,6 +766,9 @@
* registers and returns the handler.
*/
xmlRegisterCharEncodingHandler(handler);
+#ifdef DEBUG_ENCODING
+ fprintf(stderr, "Registered encoding handler for %s\n", name);
+#endif
return(handler);
}
@@ -725,11 +782,18 @@
*/
void
xmlInitCharEncodingHandlers(void) {
+ unsigned short int tst = 0x1234;
+ unsigned char *ptr = (unsigned char *) &tst;
+
if (handlers != NULL) return;
handlers = (xmlCharEncodingHandlerPtr *)
xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
+ if (*ptr == 0x12) xmlLittleEndian = 0;
+ else if (*ptr == 0x34) xmlLittleEndian = 1;
+ else fprintf(stderr, "Odd problem at endianness detection\n");
+
if (handlers == NULL) {
fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
return;
@@ -755,7 +819,8 @@
for (;nbCharEncodingHandler > 0;) {
nbCharEncodingHandler--;
if (handlers[nbCharEncodingHandler] != NULL) {
- xmlFree(handlers[nbCharEncodingHandler]->name);
+ if (handlers[nbCharEncodingHandler]->name != NULL)
+ xmlFree(handlers[nbCharEncodingHandler]->name);
xmlFree(handlers[nbCharEncodingHandler]);
}
}
@@ -798,6 +863,8 @@
*/
xmlCharEncodingHandlerPtr
xmlGetCharEncodingHandler(xmlCharEncoding enc) {
+ xmlCharEncodingHandlerPtr handler;
+
if (handlers == NULL) xmlInitCharEncodingHandlers();
switch (enc) {
case XML_CHAR_ENCODING_ERROR:
@@ -811,40 +878,68 @@
case XML_CHAR_ENCODING_UTF16BE:
return(xmlUTF16BEHandler);
case XML_CHAR_ENCODING_EBCDIC:
- return(NULL);
+ handler = xmlFindCharEncodingHandler("EBCDIC");
+ if (handler != NULL) return(handler);
+ handler = xmlFindCharEncodingHandler("ebcdic");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_UCS4LE:
- return(NULL);
+ handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
+ if (handler != NULL) return(handler);
+ handler = xmlFindCharEncodingHandler("UCS-4");
+ if (handler != NULL) return(handler);
+ handler = xmlFindCharEncodingHandler("UCS4");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_UCS4BE:
- return(NULL);
+ handler = xmlFindCharEncodingHandler("UCS4BE");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_UCS4_2143:
- return(NULL);
+ break;
case XML_CHAR_ENCODING_UCS4_3412:
- return(NULL);
+ break;
case XML_CHAR_ENCODING_UCS2:
- return(NULL);
+ handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
+ if (handler != NULL) return(handler);
+ handler = xmlFindCharEncodingHandler("UCS-2");
+ if (handler != NULL) return(handler);
+ handler = xmlFindCharEncodingHandler("UCS2");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_8859_1:
- return(NULL);
case XML_CHAR_ENCODING_8859_2:
- return(NULL);
case XML_CHAR_ENCODING_8859_3:
- return(NULL);
case XML_CHAR_ENCODING_8859_4:
- return(NULL);
case XML_CHAR_ENCODING_8859_5:
- return(NULL);
case XML_CHAR_ENCODING_8859_6:
- return(NULL);
case XML_CHAR_ENCODING_8859_7:
- return(NULL);
case XML_CHAR_ENCODING_8859_8:
- return(NULL);
case XML_CHAR_ENCODING_8859_9:
return(NULL);
case XML_CHAR_ENCODING_2022_JP:
+ handler = xmlFindCharEncodingHandler("ISO-2022-JP");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_SHIFT_JIS:
+ handler = xmlFindCharEncodingHandler("SHIFT-JIS");
+ if (handler != NULL) return(handler);
+ handler = xmlFindCharEncodingHandler("SHIFT_JIS");
+ if (handler != NULL) return(handler);
+ handler = xmlFindCharEncodingHandler("Shift_JIS");
+ if (handler != NULL) return(handler);
+ break;
case XML_CHAR_ENCODING_EUC_JP:
- return(NULL);
+ handler = xmlFindCharEncodingHandler("EUC-JP");
+ if (handler != NULL) return(handler);
+ break;
+ default:
+ break;
}
+
+#ifdef DEBUG_ENCODING
+ fprintf(stderr, "No handler found for encoding %d\n", enc);
+#endif
return(NULL);
}
@@ -858,23 +953,306 @@
*/
xmlCharEncodingHandlerPtr
xmlFindCharEncodingHandler(const char *name) {
- char upper[500];
+#ifdef LIBXML_ICONV_ENABLED
+ iconv_t icv_in, icv_out;
+ xmlCharEncodingHandlerPtr enc;
+#endif /* LIBXML_ICONV_ENABLED */
+ char upper[100];
int i;
if (handlers == NULL) xmlInitCharEncodingHandlers();
if (name == NULL) return(xmlDefaultCharEncodingHandler);
if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
- for (i = 0;i < 499;i++) {
+ for (i = 0;i < 99;i++) {
upper[i] = toupper(name[i]);
if (upper[i] == 0) break;
}
upper[i] = 0;
for (i = 0;i < nbCharEncodingHandler; i++)
- if (!strcmp(name, handlers[i]->name))
+ if (!strcmp(upper, handlers[i]->name)) {
+#ifdef DEBUG_ENCODING
+ fprintf(stderr, "Found registered handler for encoding %s\n", name);
+#endif
return(handlers[i]);
+ }
+#ifdef LIBXML_ICONV_ENABLED
+ /* check whether iconv can handle this */
+ icv_in = iconv_open("UTF-8", name);
+ icv_out = iconv_open(name, "UTF-8");
+ if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
+ enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
+ if (enc == NULL) {
+ iconv_close(icv_in);
+ iconv_close(icv_out);
+ return(NULL);
+ }
+ enc->name = NULL;
+ enc->input = NULL;
+ enc->output = NULL;
+ enc->iconv_in = icv_in;
+ enc->iconv_out = icv_out;
+#ifdef DEBUG_ENCODING
+ fprintf(stderr, "Found iconv handler for encoding %s\n", name);
+#endif
+ return enc;
+ } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
+ fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
+ }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef DEBUG_ENCODING
+ fprintf(stderr, "No handler found for encoding %s\n", name);
+#endif
return(NULL);
}
+#ifdef LIBXML_ICONV_ENABLED
+/**
+ * xmlIconvWrapper:
+ * @cd: iconv converter data structure
+ * @out: a pointer to an array of bytes to store the result
+ * @outlen: the length of @out
+ * @in: a pointer to an array of ISO Latin 1 chars
+ * @inlen: the length of @in
+ *
+ * Returns 0 if success, or
+ * -1 by lack of space, or
+ * -2 if the transcoding fails (for *in is not valid utf8 string or
+ * the result of transformation can't fit into the encoding we want), or
+ * -3 if there the last byte can't form a single output char.
+ *
+ * The value of @inlen after return is the number of octets consumed
+ * as the return value is positive, else unpredictiable.
+ * The value of @outlen after return is the number of ocetes consumed.
+ */
+static int
+xmlIconvWrapper(iconv_t cd,
+ unsigned char *out, int *outlen,
+ const unsigned char *in, int *inlen) {
+
+ size_t icv_inlen = *inlen, icv_outlen = *outlen;
+ const char *icv_in = (const char *) in;
+ char *icv_out = (char *) out;
+ int ret;
+
+ ret = iconv(cd,
+ &icv_in, &icv_inlen,
+ &icv_out, &icv_outlen);
+ *inlen -= icv_inlen;
+ *outlen -= icv_outlen;
+ if (icv_inlen != 0 || ret == (size_t) -1) {
+#ifdef EILSEQ
+ if (errno == EILSEQ) {
+ return -2;
+ } else
+#endif
+#ifdef E2BIG
+ if (errno == E2BIG) {
+ return -1;
+ } else
+#endif
+#ifdef EINVAL
+ if (errno == EINVAL) {
+ return -3;
+ }
+#endif
+ else {
+ return -3;
+ }
+ }
+ return 0;
+}
+#endif /* LIBXML_ICONV_ENABLED */
+
+/**
+ * xmlCharEncInFunc:
+ * @handler: char enconding transformation data structure
+ * @out: an xmlBuffer for the output.
+ * @in: an xmlBuffer for the input
+ *
+ * Generic front-end for the encoding handler input function
+ *
+ * Returns the number of byte written if success, or
+ * -1 general error
+ * -2 if the transcoding fails (for *in is not valid utf8 string or
+ * the result of transformation can't fit into the encoding we want), or
+ */
+int
+xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
+ xmlBufferPtr in) {
+ int ret = -2;
+ int written;
+ int toconv;
+
+ if (handler == NULL) return(-1);
+ if (out == NULL) return(-1);
+ if (in == NULL) return(-1);
+
+ written = out->size - out->use;
+ toconv = in->use;
+ if (toconv * 2 >= written) {
+ xmlBufferGrow(out, toconv * 2);
+ written = out->size - out->use - 1;
+ }
+ if (handler->input != NULL) {
+ ret = handler->input(&out->content[out->use], &written,
+ in->content, &toconv);
+ xmlBufferShrink(in, toconv);
+ out->use += written;
+ out->content[out->use] = 0;
+ }
+#ifdef LIBXML_ICONV_ENABLED
+ else if (handler->iconv_in != NULL) {
+ ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
+ &written, in->content, &toconv);
+ xmlBufferShrink(in, toconv);
+ out->use += written;
+ out->content[out->use] = 0;
+ if (ret == -1) ret = -3;
+ }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef DEBUG_ENCODING
+ switch (ret) {
+ case 0:
+ fprintf(stderr, "converted %d bytes to %d bytes of input\n",
+ toconv, written);
+ break;
+ case -1:
+ fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
+ toconv, written, in->use);
+ break;
+ case -2:
+ fprintf(stderr, "input conversion failed due to input error\n");
+ break;
+ case -3:
+ fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
+ toconv, written, in->use);
+ break;
+ default:
+ fprintf(stderr,"Unknown input conversion failed %d\n", ret);
+ }
+#endif
+ /*
+ * Ignore when input buffer is not on a boundary
+ */
+ if (ret == -3) ret = 0;
+ return(ret);
+}
+
+/**
+ * xmlCharEncOutFunc:
+ * @handler: char enconding transformation data structure
+ * @out: an xmlBuffer for the output.
+ * @in: an xmlBuffer for the input
+ *
+ * Generic front-end for the encoding handler output function
+ *
+ * Returns the number of byte written if success, or
+ * -1 general error
+ * -2 if the transcoding fails (for *in is not valid utf8 string or
+ * the result of transformation can't fit into the encoding we want), or
+ */
+int
+xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
+ xmlBufferPtr in) {
+ int ret = -2;
+ int written;
+ int toconv;
+
+ if (handler == NULL) return(-1);
+ if (out == NULL) return(-1);
+ if (in == NULL) return(-1);
+
+ written = out->size - out->use;
+ toconv = in->use;
+ if (toconv * 2 >= written) {
+ xmlBufferGrow(out, toconv * 2);
+ written = out->size - out->use - 1;
+ }
+ if (handler->output != NULL) {
+ ret = handler->output(&out->content[out->use], &written,
+ in->content, &toconv);
+ xmlBufferShrink(in, toconv);
+ out->use += written;
+ out->content[out->use] = 0;
+ }
+#ifdef LIBXML_ICONV_ENABLED
+ else if (handler->iconv_out != NULL) {
+ ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
+ &written, in->content, &toconv);
+ xmlBufferShrink(in, toconv);
+ out->use += written;
+ out->content[out->use] = 0;
+ if (ret == -1) ret = -3;
+ }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef DEBUG_ENCODING
+ switch (ret) {
+ case 0:
+ fprintf(stderr, "converted %d bytes to %d bytes of output\n",
+ toconv, written);
+ break;
+ case -1:
+ fprintf(stderr, "output conversion failed by lack of space\n");
+ break;
+ case -2:
+ fprintf(stderr, "output conversion failed due to output error\n");
+ break;
+ case -3:
+ fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
+ toconv, written, in->use);
+ break;
+ default:
+ fprintf(stderr,"Unknown output conversion failed %d\n", ret);
+ }
+#endif
+ return(ret);
+}
+
+/**
+ * xmlCharEncCloseFunc:
+ * @handler: char enconding transformation data structure
+ *
+ * Generic front-end for hencoding handler close function
+ *
+ * Returns 0 if success, or -1 in case of error
+ */
+int
+xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
+ int ret = 0;
+ if (handler == NULL) return(-1);
+ if (handler->name == NULL) return(-1);
+#ifdef LIBXML_ICONV_ENABLED
+ /*
+ * Iconv handlers can be oused only once, free the whole block.
+ * and the associated icon resources.
+ */
+ if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
+ if (handler->name != NULL)
+ xmlFree(handler->name);
+ handler->name = NULL;
+ if (handler->iconv_out != NULL) {
+ if (iconv_close(handler->iconv_out))
+ ret = -1;
+ handler->iconv_out = NULL;
+ }
+ if (handler->iconv_in != NULL) {
+ if (iconv_close(handler->iconv_in))
+ ret = -1;
+ handler->iconv_in = NULL;
+ }
+ xmlFree(handler);
+ }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef DEBUG_ENCODING
+ if (ret)
+ fprintf(stderr, "failed to close the encoding handler\n");
+ else
+ fprintf(stderr, "closed the encoding handler\n");
+
+#endif
+ return(ret);
+}
+
diff --git a/encoding.h b/encoding.h
index 1b1c92e..f6edbf2 100644
--- a/encoding.h
+++ b/encoding.h
@@ -22,12 +22,30 @@
#define __XML_CHAR_ENCODING_H__
#include <libxml/xmlversion.h>
+#ifdef LIBXML_ICONV_ENABLED
+#include <iconv.h>
+#endif
+#include <libxml/tree.h>
+
#ifdef __cplusplus
extern "C" {
#endif
/**
* Predefined values for some standard encodings
+ * Libxml don't do beforehand translation on UTF8, ISOLatinX
+ * It also support UTF16 (LE and BE) by default.
+ *
+ * Anything else would have to be translated to UTF8 before being
+ * given to the parser itself. The BOM for UTF16 and the encoding
+ * declaration are looked at and a converter is looked for at that
+ * point. If not found the parser stops here as asked by the XML REC
+ * Converter can be registered by the user using xmlRegisterCharEncodingHandler
+ * but the currentl form doesn't allow stateful transcoding (a serious
+ * problem agreed !). If iconv has been found it will be used
+ * automatically and allow stateful transcoding, the simplest is then
+ * to be sure to enable icon and to provide iconv libs for the encoding
+ * support needed.
*/
typedef enum {
XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */
@@ -65,9 +83,13 @@
* Take a block of chars in the original encoding and try to convert
* it to an UTF-8 block of chars out.
*
- * Returns the number of byte written, or -1 by lack of space.
+ * Returns the number of byte written, or -1 by lack of space, or -2
+ * if the transcoding failed.
+ * The value of @inlen after return is the number of octets consumed
+ * if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
*/
-typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
+typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen);
@@ -83,12 +105,17 @@
*
* Returns the number of byte written, or -1 by lack of space, or -2
* if the transcoding failed.
+ * The value of @inlen after return is the number of octets consumed
+ * if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
*/
-typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int outlen,
+typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen);
+
/*
* Block defining the handlers for non UTF-8 encodings.
+ * If iconv is supported, there are two extra fields.
*/
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
@@ -96,7 +123,11 @@
struct _xmlCharEncodingHandler {
char *name;
xmlCharEncodingInputFunc input;
- xmlCharEncodingOutputFunc output;
+ xmlCharEncodingOutputFunc output;
+#ifdef LIBXML_ICONV_ENABLED
+ iconv_t iconv_in;
+ iconv_t iconv_out;
+#endif /* LIBXML_ICONV_ENABLED */
};
void xmlInitCharEncodingHandlers (void);
@@ -109,6 +140,14 @@
xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name);
int xmlCheckUTF8 (const unsigned char *utf);
+int xmlCharEncOutFunc (xmlCharEncodingHandler *handler,
+ xmlBufferPtr out,
+ xmlBufferPtr in);
+
+int xmlCharEncInFunc (xmlCharEncodingHandler *handler,
+ xmlBufferPtr out,
+ xmlBufferPtr in);
+int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler);
#ifdef __cplusplus
}
diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h
index 1b1c92e..f6edbf2 100644
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -22,12 +22,30 @@
#define __XML_CHAR_ENCODING_H__
#include <libxml/xmlversion.h>
+#ifdef LIBXML_ICONV_ENABLED
+#include <iconv.h>
+#endif
+#include <libxml/tree.h>
+
#ifdef __cplusplus
extern "C" {
#endif
/**
* Predefined values for some standard encodings
+ * Libxml doesn't do beforehand translation on UTF8, ISOLatinX.
+ * It also supports UTF16 (LE and BE) by default.
+ *
+ * Anything else would have to be translated to UTF8 before being
+ * given to the parser itself. The BOM for UTF16 and the encoding
+ * declaration are looked at and a converter is looked for at that
+ * point. If not found the parser stops here as asked by the XML REC.
+ * Converters can be registered by the user using xmlRegisterCharEncodingHandler
+ * but the current form doesn't allow stateful transcoding (a serious
+ * problem agreed !). If iconv has been found it will be used
+ * automatically and allow stateful transcoding, the simplest is then
+ * to be sure to enable iconv and to provide iconv libs for the encoding
+ * support needed.
*/
typedef enum {
XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */
@@ -65,9 +83,13 @@
* Take a block of chars in the original encoding and try to convert
* it to an UTF-8 block of chars out.
*
- * Returns the number of byte written, or -1 by lack of space.
+ * Returns the number of byte written, or -1 by lack of space, or -2
+ * if the transcoding failed.
+ * The value of @inlen after return is the number of octets consumed
+ * if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
*/
-typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
+typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen);
@@ -83,12 +105,17 @@
*
* Returns the number of byte written, or -1 by lack of space, or -2
* if the transcoding failed.
+ * The value of @inlen after return is the number of octets consumed
+ * if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
*/
-typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int outlen,
+typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen);
+
/*
* Block defining the handlers for non UTF-8 encodings.
+ * If iconv is supported, there are two extra fields.
*/
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
@@ -96,7 +123,11 @@
struct _xmlCharEncodingHandler {
char *name;
xmlCharEncodingInputFunc input;
- xmlCharEncodingOutputFunc output;
+ xmlCharEncodingOutputFunc output;
+#ifdef LIBXML_ICONV_ENABLED
+ iconv_t iconv_in;
+ iconv_t iconv_out;
+#endif /* LIBXML_ICONV_ENABLED */
};
void xmlInitCharEncodingHandlers (void);
@@ -109,6 +140,14 @@
xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name);
int xmlCheckUTF8 (const unsigned char *utf);
+int xmlCharEncOutFunc (xmlCharEncodingHandler *handler,
+ xmlBufferPtr out,
+ xmlBufferPtr in);
+
+int xmlCharEncInFunc (xmlCharEncodingHandler *handler,
+ xmlBufferPtr out,
+ xmlBufferPtr in);
+int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler);
#ifdef __cplusplus
}
diff --git a/include/libxml/parserInternals.h b/include/libxml/parserInternals.h
index c359734..e7e6fa0 100644
--- a/include/libxml/parserInternals.h
+++ b/include/libxml/parserInternals.h
@@ -28,10 +28,10 @@
* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
*/
#define IS_CHAR(c) \
- ((((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \
- (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) && \
- (((c) <= 0xD7FF) || ((c) >= 0xE000)) && ((c) >= 0) && \
- ((c) <= 0x10FFFF))
+ (((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \
+ (((c) >= 0x20) && ((c) <= 0xD7FF)) || \
+ (((c) >= 0xE000) && ((c) <= 0xFFFD)) || \
+ (((c) >= 0x10000) && ((c) <= 0x10FFFF)))
/*
* [3] S ::= (#x20 | #x9 | #xD | #xA)+
@@ -442,8 +442,10 @@
xmlParserCtxtPtr xmlCreateEntityParserCtxt(const xmlChar *URL,
const xmlChar *ID,
const xmlChar *base);
-void xmlSwitchEncoding (xmlParserCtxtPtr ctxt,
+int xmlSwitchEncoding (xmlParserCtxtPtr ctxt,
xmlCharEncoding enc);
+int xmlSwitchToEncoding (xmlParserCtxtPtr ctxt,
+ xmlCharEncodingHandlerPtr handler);
void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt);
/**
diff --git a/include/libxml/tree.h b/include/libxml/tree.h
index 1cb12e2..35ea525 100644
--- a/include/libxml/tree.h
+++ b/include/libxml/tree.h
@@ -380,6 +380,8 @@
const char *str);
int xmlBufferShrink (xmlBufferPtr buf,
int len);
+int xmlBufferGrow (xmlBufferPtr buf,
+ int len);
void xmlBufferEmpty (xmlBufferPtr buf);
const xmlChar* xmlBufferContent (const xmlBufferPtr buf);
int xmlBufferUse (const xmlBufferPtr buf);
diff --git a/include/libxml/xmlIO.h b/include/libxml/xmlIO.h
index 8f9b7e0..2d14ebe 100644
--- a/include/libxml/xmlIO.h
+++ b/include/libxml/xmlIO.h
@@ -33,6 +33,7 @@
xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */
xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 */
+ xmlBufferPtr raw; /* if encoder != NULL buffer for raw input */
};
diff --git a/parser.c b/parser.c
index a8e6ff4..6714d3c 100644
--- a/parser.c
+++ b/parser.c
@@ -41,6 +41,7 @@
#include <libxml/valid.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlIO.h>
+#include <libxml/uri.h>
#include "xml-error.h"
#define XML_PARSER_BIG_BUFFER_SIZE 1000
@@ -483,7 +484,7 @@
if ((ctxt->sax != NULL) &&
(ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
- "Char out of allowed range\n");
+ "Char 0x%X out of allowed range\n", val);
ctxt->errNo = XML_ERR_INVALID_ENCODING;
ctxt->wellFormed = 0;
ctxt->disableSAX = 1;
@@ -612,7 +613,7 @@
if ((ctxt->sax != NULL) &&
(ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
- "Char out of allowed range\n");
+ "Char 0x%X out of allowed range\n", val);
ctxt->errNo = XML_ERR_INVALID_ENCODING;
ctxt->wellFormed = 0;
ctxt->disableSAX = 1;
@@ -727,7 +728,7 @@
if ((ctxt->sax != NULL) &&
(ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
- "Char out of allowed range\n");
+ "Char 0x%X out of allowed range\n", val);
ctxt->errNo = XML_ERR_INVALID_ENCODING;
ctxt->wellFormed = 0;
ctxt->disableSAX = 1;
@@ -2278,96 +2279,209 @@
*
* change the input functions when discovering the character encoding
* of a given entity.
+ *
+ * Returns 0 in case of success, -1 otherwise
*/
-void
+int
xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
{
xmlCharEncodingHandlerPtr handler;
+ switch (enc) {
+ case XML_CHAR_ENCODING_ERROR:
+ ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData, "encoding unknown\n");
+ ctxt->wellFormed = 0;
+ ctxt->disableSAX = 1;
+ break;
+ case XML_CHAR_ENCODING_NONE:
+ /* let's assume it's UTF-8 without the XML decl */
+ return(0);
+ case XML_CHAR_ENCODING_UTF8:
+ /* default encoding, no conversion should be needed */
+ return(0);
+ default:
+ break;
+ }
handler = xmlGetCharEncodingHandler(enc);
+ if (handler == NULL) {
+ /*
+ * Default handlers.
+ */
+ switch (enc) {
+ case XML_CHAR_ENCODING_ERROR:
+ ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData, "encoding unknown\n");
+ ctxt->wellFormed = 0;
+ ctxt->disableSAX = 1;
+ break;
+ case XML_CHAR_ENCODING_NONE:
+ /* let's assume it's UTF-8 without the XML decl */
+ return(0);
+ case XML_CHAR_ENCODING_UTF8:
+ /* default encoding, no conversion should be needed */
+ return(0);
+ case XML_CHAR_ENCODING_UTF16LE:
+ break;
+ case XML_CHAR_ENCODING_UTF16BE:
+ break;
+ case XML_CHAR_ENCODING_UCS4LE:
+ ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "char encoding USC4 little endian not supported\n");
+ break;
+ case XML_CHAR_ENCODING_UCS4BE:
+ ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "char encoding USC4 big endian not supported\n");
+ break;
+ case XML_CHAR_ENCODING_EBCDIC:
+ ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "char encoding EBCDIC not supported\n");
+ break;
+ case XML_CHAR_ENCODING_UCS4_2143:
+ ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "char encoding UCS4 2143 not supported\n");
+ break;
+ case XML_CHAR_ENCODING_UCS4_3412:
+ ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "char encoding UCS4 3412 not supported\n");
+ break;
+ case XML_CHAR_ENCODING_UCS2:
+ ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "char encoding UCS2 not supported\n");
+ break;
+ case XML_CHAR_ENCODING_8859_1:
+ case XML_CHAR_ENCODING_8859_2:
+ case XML_CHAR_ENCODING_8859_3:
+ case XML_CHAR_ENCODING_8859_4:
+ case XML_CHAR_ENCODING_8859_5:
+ case XML_CHAR_ENCODING_8859_6:
+ case XML_CHAR_ENCODING_8859_7:
+ case XML_CHAR_ENCODING_8859_8:
+ case XML_CHAR_ENCODING_8859_9:
+ /*
+ * Keep the internal content in the document encoding
+ */
+ if ((ctxt->inputNr == 1) &&
+ (ctxt->encoding == NULL) &&
+ (ctxt->input->encoding != NULL)) {
+ ctxt->encoding = xmlStrdup(ctxt->input->encoding);
+ }
+ return(0);
+ case XML_CHAR_ENCODING_2022_JP:
+ ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "char encoding ISO-2022-JPnot supported\n");
+ break;
+ case XML_CHAR_ENCODING_SHIFT_JIS:
+ ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "char encoding Shift_JIS not supported\n");
+ break;
+ case XML_CHAR_ENCODING_EUC_JP:
+ ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "char encoding EUC-JPnot supported\n");
+ break;
+ }
+ }
+ if (handler == NULL)
+ return(-1);
+ return(xmlSwitchToEncoding(ctxt, handler));
+}
+
+/**
+ * xmlSwitchToEncoding:
+ * @ctxt: the parser context
+ * @handler: the encoding handler
+ *
+ * change the input functions when discovering the character encoding
+ * of a given entity.
+ *
+ * Returns 0 in case of success, -1 otherwise
+ */
+int
+xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
+{
+ int nbchars;
+
if (handler != NULL) {
if (ctxt->input != NULL) {
if (ctxt->input->buf != NULL) {
if (ctxt->input->buf->encoder != NULL) {
+ if (ctxt->input->buf->encoder == handler)
+ return(0);
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : encoder already regitered\n");
- return;
+ return(-1);
}
ctxt->input->buf->encoder = handler;
/*
- * Is there already some content down the pipe to convert
+ * Is there already some content down the pipe to convert ?
*/
if ((ctxt->input->buf->buffer != NULL) &&
(ctxt->input->buf->buffer->use > 0)) {
- xmlChar *buf;
- int res, len, size;
int processed;
/*
* Specific handling of the Byte Order Mark for
* UTF-16
*/
- if ((enc == XML_CHAR_ENCODING_UTF16LE) &&
+ if ((handler->name != NULL) &&
+ (!strcmp(handler->name, "UTF-16LE")) &&
(ctxt->input->cur[0] == 0xFF) &&
(ctxt->input->cur[1] == 0xFE)) {
- SKIP(2);
+ ctxt->input->cur += 2;
}
- if ((enc == XML_CHAR_ENCODING_UTF16BE) &&
+ if ((handler->name != NULL) &&
+ (!strcmp(handler->name, "UTF-16BE")) &&
(ctxt->input->cur[0] == 0xFE) &&
(ctxt->input->cur[1] == 0xFF)) {
- SKIP(2);
+ ctxt->input->cur += 2;
}
/*
- * convert the non processed part
+ * Shrink the current input buffer.
+ * Make it the raw buffer and create a new input buffer
*/
processed = ctxt->input->cur - ctxt->input->base;
- len = ctxt->input->buf->buffer->use - processed;
-
- if (len <= 0) {
- return;
- }
- size = ctxt->input->buf->buffer->use * 4;
- if (size < 4000)
- size = 4000;
-retry_larger:
- buf = (xmlChar *) xmlMalloc(size + 1);
- if (buf == NULL) {
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "xmlSwitchEncoding : out of memory\n");
- return;
- }
- /* TODO !!! Handling of buf too small */
- res = handler->input(buf, size, ctxt->input->cur, &len);
- if (res == -1) {
- size *= 2;
- xmlFree(buf);
- goto retry_larger;
- }
- if ((res < 0) ||
- (len != ctxt->input->buf->buffer->use - processed)) {
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "xmlSwitchEncoding : conversion failed\n");
- xmlFree(buf);
- return;
- }
+ xmlBufferShrink(ctxt->input->buf->buffer, processed);
+ ctxt->input->buf->raw = ctxt->input->buf->buffer;
+ ctxt->input->buf->buffer = xmlBufferCreate();
/*
- * Conversion succeeded, get rid of the old buffer
+ * convert as much as possible of the raw input
+ * to the parser reading buffer.
*/
- xmlFree(ctxt->input->buf->buffer->content);
- ctxt->input->buf->buffer->content = buf;
- ctxt->input->base = buf;
- ctxt->input->cur = buf;
- ctxt->input->buf->buffer->size = size;
- ctxt->input->buf->buffer->use = res;
- buf[res] = 0;
+ nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
+ ctxt->input->buf->buffer,
+ ctxt->input->buf->raw);
+ if (nbchars < 0) {
+ fprintf(stderr, "xmlSwitchToEncoding: encoder error\n");
+ return(-1);
+ }
+ ctxt->input->base =
+ ctxt->input->cur = ctxt->input->buf->buffer->content;
}
- return;
+ return(0);
} else {
if (ctxt->input->length == 0) {
/*
@@ -2377,191 +2491,59 @@
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : no input\n");
- return;
+ return(-1);
} else {
- xmlChar *buf;
- int res, len;
- int processed = ctxt->input->cur - ctxt->input->base;
+ int processed;
/*
- * convert the non processed part
+ * Shrink the current input buffer.
+ * Copy the unprocessed part to the raw buffer and create a new input buffer
*/
- len = ctxt->input->length - processed;
- if (len <= 0) {
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "xmlSwitchEncoding : input fully consumed?\n");
- return;
+ processed = ctxt->input->cur - ctxt->input->base;
+ ctxt->input->buf->raw = xmlBufferCreate();
+ xmlBufferAdd(ctxt->input->buf->raw, ctxt->input->cur,
+ ctxt->input->length - processed);
+ ctxt->input->buf->buffer = xmlBufferCreate();
+
+ /*
+ * convert as much as possible of the raw input
+ * to the parser reading buffer.
+ */
+ nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
+ ctxt->input->buf->buffer,
+ ctxt->input->buf->raw);
+ if (nbchars < 0) {
+ fprintf(stderr, "xmlSwitchToEncoding: encoder error\n");
+ return(-1);
}
- buf = (xmlChar *) xmlMalloc(ctxt->input->length * 4);
- if (buf == NULL) {
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "xmlSwitchEncoding : out of memory\n");
- return;
- }
- res = handler->input(buf, ctxt->input->length * 4,
- ctxt->input->cur, &len);
- if ((res < 0) ||
- (len != ctxt->input->length - processed)) {
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "xmlSwitchEncoding : conversion failed\n");
- xmlFree(buf);
- return;
- }
+
/*
* Conversion succeeded, get rid of the old buffer
*/
if ((ctxt->input->free != NULL) &&
(ctxt->input->base != NULL))
ctxt->input->free((xmlChar *) ctxt->input->base);
- ctxt->input->base = ctxt->input->cur = buf;
- ctxt->input->length = res;
+ ctxt->input->base =
+ ctxt->input->cur = ctxt->input->buf->buffer->content;
}
}
} else {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : no input\n");
+ return(-1);
}
- }
+ /*
+ * The parsing is now done in UTF8 natively
+ */
+ if (ctxt->encoding != NULL) {
+ xmlFree((xmlChar *) ctxt->encoding);
+ ctxt->encoding = NULL;
+ }
+ } else
+ return(-1);
+ return(0);
- switch (enc) {
- case XML_CHAR_ENCODING_ERROR:
- ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData, "encoding unknown\n");
- ctxt->wellFormed = 0;
- ctxt->disableSAX = 1;
- break;
- case XML_CHAR_ENCODING_NONE:
- /* let's assume it's UTF-8 without the XML decl */
- return;
- case XML_CHAR_ENCODING_UTF8:
- /* default encoding, no conversion should be needed */
- return;
- case XML_CHAR_ENCODING_UTF16LE:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding UTF16 little endian not supported\n");
- break;
- case XML_CHAR_ENCODING_UTF16BE:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding UTF16 big endian not supported\n");
- break;
- case XML_CHAR_ENCODING_UCS4LE:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding USC4 little endian not supported\n");
- break;
- case XML_CHAR_ENCODING_UCS4BE:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding USC4 big endian not supported\n");
- break;
- case XML_CHAR_ENCODING_EBCDIC:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding EBCDIC not supported\n");
- break;
- case XML_CHAR_ENCODING_UCS4_2143:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding UCS4 2143 not supported\n");
- break;
- case XML_CHAR_ENCODING_UCS4_3412:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding UCS4 3412 not supported\n");
- break;
- case XML_CHAR_ENCODING_UCS2:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding UCS2 not supported\n");
- break;
- case XML_CHAR_ENCODING_8859_1:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
- break;
- case XML_CHAR_ENCODING_8859_2:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
- break;
- case XML_CHAR_ENCODING_8859_3:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding ISO_8859_3 not supported\n");
- break;
- case XML_CHAR_ENCODING_8859_4:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding ISO_8859_4 not supported\n");
- break;
- case XML_CHAR_ENCODING_8859_5:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding ISO_8859_5 not supported\n");
- break;
- case XML_CHAR_ENCODING_8859_6:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding ISO_8859_6 not supported\n");
- break;
- case XML_CHAR_ENCODING_8859_7:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding ISO_8859_7 not supported\n");
- break;
- case XML_CHAR_ENCODING_8859_8:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding ISO_8859_8 not supported\n");
- break;
- case XML_CHAR_ENCODING_8859_9:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding ISO_8859_9 not supported\n");
- break;
- case XML_CHAR_ENCODING_2022_JP:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding ISO-2022-JPnot supported\n");
- break;
- case XML_CHAR_ENCODING_SHIFT_JIS:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding Shift_JISnot supported\n");
- break;
- case XML_CHAR_ENCODING_EUC_JP:
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "char encoding EUC-JPnot supported\n");
- break;
- }
}
/************************************************************************
@@ -4253,7 +4235,7 @@
void
xmlParseComment(xmlParserCtxtPtr ctxt) {
xmlChar *buf = NULL;
- int len = 0;
+ int len;
int size = XML_PARSER_BUFFER_SIZE;
int q, ql;
int r, rl;
@@ -4282,10 +4264,11 @@
r = CUR_CHAR(rl);
NEXTL(rl);
cur = CUR_CHAR(l);
+ len = 0;
while (IS_CHAR(cur) &&
((cur != '>') ||
(r != '-') || (q != '-'))) {
- if ((r == '-') && (q == '-')) {
+ if ((r == '-') && (q == '-') && (len > 1)) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Comment must not contain '--' (double-hyphen)`\n");
@@ -4732,11 +4715,36 @@
ctxt->disableSAX = 1;
}
if (URI) {
- if ((ctxt->sax != NULL) &&
- (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
- ctxt->sax->entityDecl(ctxt->userData, name,
- XML_EXTERNAL_PARAMETER_ENTITY,
- literal, URI, NULL);
+ xmlURIPtr uri;
+
+ uri = xmlParseURI((const char *) URI);
+ if (uri == NULL) {
+ if ((ctxt->sax != NULL) &&
+ (!ctxt->disableSAX) &&
+ (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "Invalid URI: %s\n", URI);
+ ctxt->wellFormed = 0;
+ ctxt->errNo = XML_ERR_INVALID_URI;
+ } else {
+ if (uri->fragment != NULL) {
+ if ((ctxt->sax != NULL) &&
+ (!ctxt->disableSAX) &&
+ (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "Fragment not allowed: %s\n", URI);
+ ctxt->wellFormed = 0;
+ ctxt->errNo = XML_ERR_URI_FRAGMENT;
+ } else {
+ if ((ctxt->sax != NULL) &&
+ (!ctxt->disableSAX) &&
+ (ctxt->sax->entityDecl != NULL))
+ ctxt->sax->entityDecl(ctxt->userData, name,
+ XML_EXTERNAL_PARAMETER_ENTITY,
+ literal, URI, NULL);
+ }
+ xmlFreeURI(uri);
+ }
}
}
} else {
@@ -4757,6 +4765,31 @@
ctxt->wellFormed = 0;
ctxt->disableSAX = 1;
}
+ if (URI) {
+ xmlURIPtr uri;
+
+ uri = xmlParseURI((const char *)URI);
+ if (uri == NULL) {
+ if ((ctxt->sax != NULL) &&
+ (!ctxt->disableSAX) &&
+ (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "Invalid URI: %s\n", URI);
+ ctxt->wellFormed = 0;
+ ctxt->errNo = XML_ERR_INVALID_URI;
+ } else {
+ if (uri->fragment != NULL) {
+ if ((ctxt->sax != NULL) &&
+ (!ctxt->disableSAX) &&
+ (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "Fragment not allowed: %s\n", URI);
+ ctxt->wellFormed = 0;
+ ctxt->errNo = XML_ERR_URI_FRAGMENT;
+ }
+ xmlFreeURI(uri);
+ }
+ }
if ((RAW != '>') && (!IS_BLANK(CUR))) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
@@ -5973,7 +6006,20 @@
/*
* We know that '<?xml' is here.
*/
- SKIP(5);
+ if ((RAW == '<') && (NXT(1) == '?') &&
+ (NXT(2) == 'x') && (NXT(3) == 'm') &&
+ (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
+ SKIP(5);
+ } else {
+ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "Text declaration '<?xml' required\n");
+ ctxt->errNo = XML_ERR_XMLDECL_NOT_STARTED;
+ ctxt->wellFormed = 0;
+ ctxt->disableSAX = 1;
+
+ return;
+ }
if (!IS_BLANK(CUR)) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
@@ -6003,7 +6049,13 @@
ctxt->wellFormed = 0;
ctxt->disableSAX = 1;
}
- ctxt->input->encoding = xmlParseEncodingDecl(ctxt);
+ xmlParseEncodingDecl(ctxt);
+ if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+ /*
+ * The XML REC instructs us to stop parsing right here
+ */
+ return;
+ }
SKIP_BLANKS;
if ((RAW == '?') && (NXT(1) == '>')) {
@@ -6192,6 +6244,13 @@
(NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l')) {
xmlParseTextDecl(ctxt);
+ if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+ /*
+ * The XML REC instructs us to stop parsing right here
+ */
+ ctxt->instate = XML_PARSER_EOF;
+ return;
+ }
}
if (ctxt->myDoc == NULL) {
ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0");
@@ -6441,6 +6500,13 @@
(NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
xmlParseTextDecl(ctxt);
+ if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+ /*
+ * The XML REC instructs us to stop parsing right here
+ */
+ ctxt->instate = XML_PARSER_EOF;
+ return;
+ }
if (input->standalone) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
@@ -6947,6 +7013,15 @@
(NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
xmlParseTextDecl(ctxt);
+ if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+ /*
+ * The XML REC instructs us to stop parsing
+ * right here
+ */
+ ctxt->instate = XML_PARSER_EOF;
+ xmlFree(name);
+ return;
+ }
}
if (ctxt->token == 0)
ctxt->token = ' ';
@@ -8197,6 +8272,38 @@
ctxt->disableSAX = 1;
ctxt->errNo = XML_ERR_STRING_NOT_STARTED;
}
+ if (encoding != NULL) {
+ xmlCharEncoding enc;
+ xmlCharEncodingHandlerPtr handler;
+
+ if (ctxt->input->encoding != NULL)
+ xmlFree((xmlChar *) ctxt->input->encoding);
+ ctxt->input->encoding = encoding;
+
+ enc = xmlParseCharEncoding((const char *) encoding);
+ /*
+ * registered set of known encodings
+ */
+ if (enc != XML_CHAR_ENCODING_ERROR) {
+ xmlSwitchEncoding(ctxt, enc);
+ if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+ xmlFree(encoding);
+ return(NULL);
+ }
+ } else {
+ /*
+ * fallback for unknown encodings
+ */
+ handler = xmlFindCharEncodingHandler((const char *) encoding);
+ if (handler != NULL) {
+ xmlSwitchToEncoding(ctxt, handler);
+ } else {
+ ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ xmlFree(encoding);
+ return(NULL);
+ }
+ }
+ }
}
return(encoding);
}
@@ -8362,7 +8469,13 @@
ctxt->wellFormed = 0;
ctxt->disableSAX = 1;
}
- ctxt->input->encoding = xmlParseEncodingDecl(ctxt);
+ xmlParseEncodingDecl(ctxt);
+ if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+ /*
+ * The XML REC instructs us to stop parsing right here
+ */
+ return;
+ }
/*
* We may have the standalone status.
@@ -8489,12 +8602,19 @@
if ((RAW == '<') && (NXT(1) == '?') &&
(NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
+
+ /*
+ * Note that we will switch encoding on the fly.
+ */
xmlParseXMLDecl(ctxt);
+ if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+ /*
+ * The XML REC instructs us to stop parsing right here
+ */
+ return(-1);
+ }
ctxt->standalone = ctxt->input->standalone;
SKIP_BLANKS;
- if ((ctxt->encoding == NULL) && (ctxt->input->encoding != NULL))
- ctxt->encoding = xmlStrdup(ctxt->input->encoding);
-
} else {
ctxt->version = xmlCharStrdup(XML_DEFAULT_VERSION);
}
@@ -8581,14 +8701,6 @@
(!ctxt->disableSAX))
ctxt->sax->endDocument(ctxt->userData);
- /*
- * Grab the encoding if it was added on-the-fly
- */
- if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) &&
- (ctxt->myDoc->encoding == NULL)) {
- ctxt->myDoc->encoding = ctxt->encoding;
- ctxt->encoding = NULL;
- }
if (! ctxt->wellFormed) return(-1);
return(0);
}
@@ -8805,6 +8917,14 @@
fprintf(stderr, "PP: Parsing XML Decl\n");
#endif
xmlParseXMLDecl(ctxt);
+ if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+ /*
+ * The XML REC instructs us to stop parsing right
+ * here
+ */
+ ctxt->instate = XML_PARSER_EOF;
+ return(0);
+ }
ctxt->standalone = ctxt->input->standalone;
if ((ctxt->encoding == NULL) &&
(ctxt->input->encoding != NULL))
diff --git a/parserInternals.h b/parserInternals.h
index c359734..e7e6fa0 100644
--- a/parserInternals.h
+++ b/parserInternals.h
@@ -28,10 +28,10 @@
* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
*/
#define IS_CHAR(c) \
- ((((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \
- (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) && \
- (((c) <= 0xD7FF) || ((c) >= 0xE000)) && ((c) >= 0) && \
- ((c) <= 0x10FFFF))
+ (((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \
+ (((c) >= 0x20) && ((c) <= 0xD7FF)) || \
+ (((c) >= 0xE000) && ((c) <= 0xFFFD)) || \
+ (((c) >= 0x10000) && ((c) <= 0x10FFFF)))
/*
* [3] S ::= (#x20 | #x9 | #xD | #xA)+
@@ -442,8 +442,10 @@
xmlParserCtxtPtr xmlCreateEntityParserCtxt(const xmlChar *URL,
const xmlChar *ID,
const xmlChar *base);
-void xmlSwitchEncoding (xmlParserCtxtPtr ctxt,
+int xmlSwitchEncoding (xmlParserCtxtPtr ctxt,
xmlCharEncoding enc);
+int xmlSwitchToEncoding (xmlParserCtxtPtr ctxt,
+ xmlCharEncodingHandlerPtr handler);
void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt);
/**
diff --git a/tree.c b/tree.c
index 2cc4b51..74b5321 100644
--- a/tree.c
+++ b/tree.c
@@ -3772,6 +3772,31 @@
}
/**
+ * xmlBufferGrow:
+ * @buf: the buffer
+ * @len: the minimum free size to allocate
+ *
+ * Grow the available space of an XML buffer.
+ *
+ * Returns the new available space, 0 if no growth was needed, or -1 on error
+ */
+int
+xmlBufferGrow(xmlBufferPtr buf, int len) {
+ int size;
+ xmlChar *newbuf;
+
+ if (len < buf->size - buf->use) return(0); /* enough free room already */
+
+ size = buf->size + buf->use + len + 100;
+
+ newbuf = xmlRealloc(buf->content, size);
+ if (newbuf == NULL) return(-1);
+ buf->content = newbuf;
+ buf->size = size;
+ return(buf->size - buf->use);
+}
+
+/**
* xmlBufferDump:
* @file: the file output
* @buf: the buffer to dump
diff --git a/tree.h b/tree.h
index 1cb12e2..35ea525 100644
--- a/tree.h
+++ b/tree.h
@@ -380,6 +380,8 @@
const char *str);
int xmlBufferShrink (xmlBufferPtr buf,
int len);
+int xmlBufferGrow (xmlBufferPtr buf,
+ int len);
void xmlBufferEmpty (xmlBufferPtr buf);
const xmlChar* xmlBufferContent (const xmlBufferPtr buf);
int xmlBufferUse (const xmlBufferPtr buf);
diff --git a/uri.c b/uri.c
index 1a48113..6000d39 100644
--- a/uri.c
+++ b/uri.c
@@ -1284,6 +1284,34 @@
}
/**
+ * xmlParseURI:
+ * @str: the URI string to analyze
+ *
+ * Parse a URI reference and build the matching URI structure.
+ *
+ * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+ *
+ * Returns a newly built xmlURIPtr or NULL in case of error
+ */
+xmlURIPtr
+xmlParseURI(const char *str) {
+    xmlURIPtr parsed;
+
+    if (str == NULL)
+        return(NULL);
+    parsed = xmlCreateURI();
+    if (parsed == NULL)
+        return(NULL);
+
+    /* Any non-zero status is treated as a parse failure. */
+    if (xmlParseURIReference(parsed, str) != 0) {
+        xmlFreeURI(parsed);
+        return(NULL);
+    }
+    return(parsed);
+}
+
+/**
* xmlNormalizeURIPath:
* @path: pointer to the path string
*
diff --git a/xml-error.h b/xml-error.h
index 34f4e66..25d9db0 100644
--- a/xml-error.h
+++ b/xml-error.h
@@ -130,7 +130,9 @@
XML_ERR_ENTITY_CHAR_ERROR, /* 88 */
XML_ERR_ENTITY_PE_INTERNAL, /* 88 */
XML_ERR_ENTITY_LOOP, /* 89 */
- XML_ERR_ENTITY_BOUNDARY /* 90 */
+ XML_ERR_ENTITY_BOUNDARY, /* 90 */
+ XML_ERR_INVALID_URI, /* 91 */
+ XML_ERR_URI_FRAGMENT /* 92 */
}xmlParserErrors;
void xmlParserError (void *ctx,
diff --git a/xmlIO.c b/xmlIO.c
index 65f5632..841a6b6 100644
--- a/xmlIO.c
+++ b/xmlIO.c
@@ -498,6 +498,10 @@
}
ret->buffer->alloc = XML_BUFFER_ALLOC_DOUBLEIT;
ret->encoder = xmlGetCharEncodingHandler(enc);
+ if (ret->encoder != NULL)
+ ret->raw = xmlBufferCreate();
+ else
+ ret->raw = NULL;
ret->readcallback = NULL;
ret->closecallback = NULL;
ret->context = NULL;
@@ -513,13 +517,20 @@
*/
void
xmlFreeParserInputBuffer(xmlParserInputBufferPtr in) {
- if (in->buffer != NULL) {
- xmlBufferFree(in->buffer);
- in->buffer = NULL;
+ if (in->raw) {
+ xmlBufferFree(in->raw);
+ in->raw = NULL;
+ }
+ if (in->encoder != NULL) {
+ xmlCharEncCloseFunc(in->encoder);
}
if (in->closecallback != NULL) {
in->closecallback(in->context);
}
+ if (in->buffer != NULL) {
+ xmlBufferFree(in->buffer);
+ in->buffer = NULL;
+ }
memset(in, 0xbe, (size_t) sizeof(xmlParserInputBuffer));
xmlFree(in);
@@ -683,34 +694,22 @@
if (len < 0) return(0);
if (in->encoder != NULL) {
- xmlChar *buffer;
- int processed = len;
-
- buffer = (xmlChar *) xmlMalloc((len + 1) * 2 * sizeof(xmlChar));
- if (buffer == NULL) {
- fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n");
- return(-1);
- }
- nbchars = in->encoder->input(buffer, (len + 1) * 2 * sizeof(xmlChar),
- (xmlChar *) buf, &processed);
- /*
- * TODO : we really need to have something atomic or the
- * encoder must report the number of bytes read
+ /*
+ * Store the data in the incoming raw buffer
*/
+ if (in->raw == NULL) {
+ in->raw = xmlBufferCreate();
+ }
+ xmlBufferAdd(in->raw, (const xmlChar *) buf, len);
+
+ /*
+ * convert as much as possible to the parser reading buffer.
+ */
+ nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
if (nbchars < 0) {
fprintf(stderr, "xmlParserInputBufferPush: encoder error\n");
- xmlFree(buffer);
return(-1);
}
- if (processed != len) {
- fprintf(stderr,
- "TODO xmlParserInputBufferPush: processed != len\n");
- xmlFree(buffer);
- return(-1);
- }
- buffer[nbchars] = 0;
- xmlBufferAdd(in->buffer, (xmlChar *) buffer, nbchars);
- xmlFree(buffer);
} else {
nbchars = len;
xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars);
@@ -730,7 +729,9 @@
* Grow up the content of the input buffer, the old data are preserved
* This routine handle the I18N transcoding to internal UTF-8
* This routine is used when operating the parser in normal (pull) mode
- * TODO: one should be able to remove one extra copy
+ *
+ * TODO: one should be able to remove one extra copy by copying directly
+ * onto in->buffer or in->raw
*
* Returns the number of chars read and stored in the buffer, or -1
* in case of error.
@@ -779,34 +780,22 @@
return(-1);
}
if (in->encoder != NULL) {
- xmlChar *buf;
- int wrote = res;
-
- buf = (xmlChar *) xmlMalloc((res + 1) * 2 * sizeof(xmlChar));
- if (buf == NULL) {
- fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n");
- xmlFree(buffer);
- return(-1);
+ /*
+ * Store the data in the incoming raw buffer
+ */
+ if (in->raw == NULL) {
+ in->raw = xmlBufferCreate();
}
- nbchars = in->encoder->input(buf, (res + 1) * 2 * sizeof(xmlChar),
- BAD_CAST buffer, &wrote);
- buf[nbchars] = 0;
- xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars);
- xmlFree(buf);
+ xmlBufferAdd(in->raw, (const xmlChar *) buffer, len);
/*
- * Check that the encoder was able to process the full input
+ * convert as much as possible to the parser reading buffer.
*/
- if (wrote != res) {
- fprintf(stderr,
- "TODO : xmlParserInputBufferGrow wrote %d != res %d\n",
- wrote, res);
- /*
- * TODO !!!
- * Need to keep the unprocessed input in a buffer in->unprocessed
- */
+ nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
+ if (nbchars < 0) {
+ fprintf(stderr, "xmlParserInputBufferGrow: encoder error\n");
+ return(-1);
}
-
} else {
nbchars = res;
buffer[nbchars] = 0;
diff --git a/xmlIO.h b/xmlIO.h
index 8f9b7e0..2d14ebe 100644
--- a/xmlIO.h
+++ b/xmlIO.h
@@ -33,6 +33,7 @@
xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */
xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 */
+ xmlBufferPtr raw; /* if encoder != NULL buffer for raw input */
};