Blame - encoding.c - platform/external/libxml2

blob: 06ebd2a353d4084768a8bcadd8d7bb1bba74fcaa [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* encoding.c : implements the encoding conversion functions needed for XML
				3	*
				4	* Related specs:
				5	* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
				6	* rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
				7	* [ISO-10646] UTF-8 and UTF-16 in Annexes
				8	* [ISO-8859-1] ISO Latin-1 characters codes.
				9	* [UNICODE] The Unicode Consortium, "The Unicode Standard --
				10	* Worldwide Character Encoding -- Version 1.0", Addison-
				11	* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
				12	* described in Unicode Technical Report #4.
				13	* [US-ASCII] Coded Character Set--7-bit American Standard Code for
				14	* Information Interchange, ANSI X3.4-1986.
				15	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	16	* See Copyright for the status of this software.
				17	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	18	* daniel@veillard.com
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	19	*
				20	* UTF8 string routines from:
				21	* "William M. Brack" <wbrack@mmm.com.hk>
				22	*
				23	* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	24	*/
				25
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	26	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	27
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	28	#include <string.h>
				29
				30	#ifdef HAVE_CTYPE_H
				31	#include <ctype.h>
				32	#endif
				33	#ifdef HAVE_STDLIB_H
				34	#include <stdlib.h>
				35	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	36	#ifdef LIBXML_ICONV_ENABLED
				37	#ifdef HAVE_ERRNO_H
				38	#include <errno.h>
				39	#endif
				40	#endif
				41	#include <libxml/encoding.h>
				42	#include <libxml/xmlmemory.h>
				43	#ifdef LIBXML_HTML_ENABLED
				44	#include <libxml/HTMLparser.h>
				45	#endif
Daniel Veillard	64a411c	2001-10-15 12:32:07 +0000	[diff] [blame]	46	#include <libxml/globals.h>
Daniel Veillard	a4617b8	2001-11-04 20:19:12 +0000	[diff] [blame]	47	#include <libxml/xmlerror.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	48
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	49	static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
				50	static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	51
				52	typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
				53	typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
				54	struct _xmlCharEncodingAlias {
				55	const char *name;
				56	const char *alias;
				57	};
				58
				59	static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
				60	static int xmlCharEncodingAliasesNb = 0;
				61	static int xmlCharEncodingAliasesMax = 0;
				62
				63	#ifdef LIBXML_ICONV_ENABLED
				64	#if 0
				65	#define DEBUG_ENCODING /* Define this to get encoding traces */
				66	#endif
				67	#endif
				68
				69	static int xmlLittleEndian = 1;
				70
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	71	/************************************************************************
				72	* *
				73	* Generic UTF8 handling routines *
				74	* *
				75	* From rfc2044: encoding of the Unicode values on UTF-8: *
				76	* *
				77	* UCS-4 range (hex.) UTF-8 octet sequence (binary) *
				78	* 0000 0000-0000 007F 0xxxxxxx *
				79	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
				80	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
				81	* *
				82	* I hope we won't use values > 0xFFFF anytime soon ! *
				83	* *
				84	************************************************************************/
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	85
				86	/**
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame]	87	* xmlUTF8Strlen:
				88	* @utf: a sequence of UTF-8 encoded bytes
				89	*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	90	* compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame]	91	* checking of the content of the string.
				92	*
				93	* Returns the number of characters in the string or -1 in case of error
				94	*/
				95	int
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	96	xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame]	97	int ret = 0;
				98
				99	if (utf == NULL)
				100	return(-1);
				101
				102	while (*utf != 0) {
				103	if (utf[0] & 0x80) {
				104	if ((utf[1] & 0xc0) != 0x80)
				105	return(-1);
				106	if ((utf[0] & 0xe0) == 0xe0) {
				107	if ((utf[2] & 0xc0) != 0x80)
				108	return(-1);
				109	if ((utf[0] & 0xf0) == 0xf0) {
				110	if ((utf[0] & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				111	return(-1);
				112	utf += 4;
				113	} else {
				114	utf += 3;
				115	}
				116	} else {
				117	utf += 2;
				118	}
				119	} else {
				120	utf++;
				121	}
				122	ret++;
				123	}
				124	return(ret);
				125	}
				126
				127	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	128	* xmlGetUTF8Char:
				129	* @utf: a sequence of UTF-8 encoded bytes
				130	* @len: a pointer to @bytes len
				131	*
				132	* Read one UTF8 Char from @utf
				133	*
				134	* Returns the char value or -1 in case of error and update @len with the
				135	* number of bytes used
				136	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	137	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	138	xmlGetUTF8Char(const unsigned char utf, int len) {
				139	unsigned int c;
				140
				141	if (utf == NULL)
				142	goto error;
				143	if (len == NULL)
				144	goto error;
				145	if (*len < 1)
				146	goto error;
				147
				148	c = utf[0];
				149	if (c & 0x80) {
				150	if (*len < 2)
				151	goto error;
				152	if ((utf[1] & 0xc0) != 0x80)
				153	goto error;
				154	if ((c & 0xe0) == 0xe0) {
				155	if (*len < 3)
				156	goto error;
				157	if ((utf[2] & 0xc0) != 0x80)
				158	goto error;
				159	if ((c & 0xf0) == 0xf0) {
				160	if (*len < 4)
				161	goto error;
				162	if ((c & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				163	goto error;
				164	*len = 4;
				165	/* 4-byte code */
				166	c = (utf[0] & 0x7) << 18;
				167	c \|= (utf[1] & 0x3f) << 12;
				168	c \|= (utf[2] & 0x3f) << 6;
				169	c \|= utf[3] & 0x3f;
				170	} else {
				171	/* 3-byte code */
				172	*len = 3;
				173	c = (utf[0] & 0xf) << 12;
				174	c \|= (utf[1] & 0x3f) << 6;
				175	c \|= utf[2] & 0x3f;
				176	}
				177	} else {
				178	/* 2-byte code */
				179	*len = 2;
				180	c = (utf[0] & 0x1f) << 6;
				181	c \|= utf[1] & 0x3f;
				182	}
				183	} else {
				184	/* 1-byte code */
				185	*len = 1;
				186	}
				187	return(c);
				188
				189	error:
				190	*len = 0;
				191	return(-1);
				192	}
				193
				194	/**
				195	* xmlCheckUTF8: Check utf-8 string for legality.
				196	* @utf: Pointer to putative utf-8 encoded string.
				197	*
				198	* Checks @utf for being valid utf-8. @utf is assumed to be
				199	* null-terminated. This function is not super-strict, as it will
				200	* allow longer utf-8 sequences than necessary. Note that Java is
				201	* capable of producing these sequences if provoked. Also note, this
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	202	* routine checks for the 4-byte maximum size, but does not check for
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	203	* 0x10ffff maximum value.
				204	*
				205	* Return value: true if @utf is valid.
				206	**/
				207	int
				208	xmlCheckUTF8(const unsigned char *utf)
				209	{
				210	int ix;
				211	unsigned char c;
				212
				213	for (ix = 0; (c = utf[ix]);) {
				214	if (c & 0x80) {
				215	if ((utf[ix + 1] & 0xc0) != 0x80)
				216	return(0);
				217	if ((c & 0xe0) == 0xe0) {
				218	if ((utf[ix + 2] & 0xc0) != 0x80)
				219	return(0);
				220	if ((c & 0xf0) == 0xf0) {
				221	if ((c & 0xf8) != 0xf0 \|\| (utf[ix + 3] & 0xc0) != 0x80)
				222	return(0);
				223	ix += 4;
				224	/* 4-byte code */
				225	} else
				226	/* 3-byte code */
				227	ix += 3;
				228	} else
				229	/* 2-byte code */
				230	ix += 2;
				231	} else
				232	/* 1-byte code */
				233	ix++;
				234	}
				235	return(1);
				236	}
				237
				238	/**
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	239	* xmlUTF8Strsize:
				240	* @utf: a sequence of UTF-8 encoded bytes
				241	* @len: the number of characters in the array
				242	*
				243	* storage size of an UTF8 string
				244	*
				245	* Returns the storage size of
				246	* the first 'len' characters of ARRAY
				247	*
				248	*/
				249
				250	int
				251	xmlUTF8Strsize(const xmlChar *utf, int len) {
				252	const xmlChar *ptr=utf;
				253	xmlChar ch;
				254
				255	if (len <= 0)
				256	return(0);
				257
				258	while ( len-- > 0) {
				259	if ( !*ptr )
				260	break;
				261	if ( (ch = *ptr++) & 0x80)
				262	while ( (ch<<=1) & 0x80 )
				263	ptr++;
				264	}
				265	return (ptr - utf);
				266	}
				267
				268
				269	/**
				270	* xmlUTF8Strndup:
				271	* @utf: the input UTF8 *
				272	* @len: the len of @utf (in chars)
				273	*
				274	* a strndup for array of UTF8's
				275	*
				276	* Returns a new UTF8 * or NULL
				277	*/
				278	xmlChar *
				279	xmlUTF8Strndup(const xmlChar *utf, int len) {
				280	xmlChar *ret;
				281	int i;
				282
				283	if ((utf == NULL) \|\| (len < 0)) return(NULL);
				284	i = xmlUTF8Strsize(utf, len);
				285	ret = (xmlChar ) xmlMalloc((i + 1) sizeof(xmlChar));
				286	if (ret == NULL) {
				287	xmlGenericError(xmlGenericErrorContext,
				288	"malloc of %ld byte failed\n",
				289	(len + 1) * (long)sizeof(xmlChar));
				290	return(NULL);
				291	}
				292	memcpy(ret, utf, i * sizeof(xmlChar));
				293	ret[i] = 0;
				294	return(ret);
				295	}
				296
				297	/**
				298	* xmlUTF8Strpos:
				299	* @utf: the input UTF8 *
				300	* @pos: the position of the desired UTF8 char (in chars)
				301	*
				302	* a function to provide the equivalent of fetching a
				303	* character from a string array
				304	*
				305	* Returns a pointer to the UTF8 character or NULL
				306	*/
				307	xmlChar *
				308	xmlUTF8Strpos(const xmlChar *utf, int pos) {
				309	xmlChar ch;
				310
				311	if (utf == NULL) return(NULL);
				312	if ( (pos < 0) \|\| (pos >= xmlUTF8Strlen(utf)) )
				313	return(NULL);
				314	while (pos--) {
				315	if ((ch=*utf++) == 0) return(NULL);
				316	if ( ch & 0x80 ) {
				317	/* if not simple ascii, verify proper format */
				318	if ( (ch & 0xc0) != 0xc0 )
				319	return(NULL);
				320	/* then skip over remaining bytes for this char */
				321	while ( (ch <<= 1) & 0x80 )
				322	if ( (*utf++ & 0xc0) != 0x80 )
				323	return(NULL);
				324	}
				325	}
				326	return((xmlChar *)utf);
				327	}
				328
				329	/**
				330	* xmlUTF8Strloc:
				331	* @utf: the input UTF8 *
				332	* @utfchar: the UTF8 character to be found
				333	*
				334	* a function to provide relative location of a UTF8 char
				335	*
				336	* Returns the relative character position of the desired char
				337	* or -1 if not found
				338	*/
				339	int
				340	xmlUTF8Strloc(const xmlChar utf, const xmlChar utfchar) {
				341	int i, size;
				342	xmlChar ch;
				343
				344	if (utf==NULL \|\| utfchar==NULL) return -1;
				345	size = xmlUTF8Strsize(utfchar, 1);
				346	for(i=0; (ch=*utf) != 0; i++) {
				347	if (xmlStrncmp(utf, utfchar, size)==0)
				348	return(i);
				349	utf++;
				350	if ( ch & 0x80 ) {
				351	/* if not simple ascii, verify proper format */
				352	if ( (ch & 0xc0) != 0xc0 )
				353	return(-1);
				354	/* then skip over remaining bytes for this char */
				355	while ( (ch <<= 1) & 0x80 )
				356	if ( (*utf++ & 0xc0) != 0x80 )
				357	return(-1);
				358	}
				359	}
				360
				361	return(-1);
				362	}
				363	/**
				364	* xmlUTF8Strsub:
				365	* @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	366	* @start: relative pos of first char
				367	* @len: total number to copy
				368	*
				369	* Note: positions are given in units of UTF-8 chars
				370	*
				371	* Returns a pointer to a newly created string
				372	* or NULL if any problem
				373	*/
				374
				375	xmlChar *
				376	xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
				377	int i;
				378	xmlChar ch;
				379
				380	if (utf == NULL) return(NULL);
				381	if (start < 0) return(NULL);
				382	if (len < 0) return(NULL);
				383
				384	/*
				385	* Skip over any leading chars
				386	*/
				387	for (i = 0;i < start;i++) {
				388	if ((ch=*utf++) == 0) return(NULL);
				389	if ( ch & 0x80 ) {
				390	/* if not simple ascii, verify proper format */
				391	if ( (ch & 0xc0) != 0xc0 )
				392	return(NULL);
				393	/* then skip over remaining bytes for this char */
				394	while ( (ch <<= 1) & 0x80 )
				395	if ( (*utf++ & 0xc0) != 0x80 )
				396	return(NULL);
				397	}
				398	}
				399
				400	return(xmlUTF8Strndup(utf, len));
				401	}
				402
				403	/************************************************************************
				404	* *
				405	* Conversions To/From UTF8 encoding *
				406	* *
				407	************************************************************************/
				408
				409	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	410	* asciiToUTF8:
				411	* @out: a pointer to an array of bytes to store the result
				412	* @outlen: the length of @out
				413	* @in: a pointer to an array of ASCII chars
				414	* @inlen: the length of @in
				415	*
				416	* Take a block of ASCII chars in and try to convert it to an UTF-8
				417	* block of chars out.
				418	* Returns 0 if success, or -1 otherwise
				419	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	420	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	421	* The value of @outlen after return is the number of ocetes consumed.
				422	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	423	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	424	asciiToUTF8(unsigned char* out, int *outlen,
				425	const unsigned char* in, int *inlen) {
				426	unsigned char* outstart = out;
				427	const unsigned char* base = in;
				428	const unsigned char* processed = in;
				429	unsigned char* outend = out + *outlen;
				430	const unsigned char* inend;
				431	unsigned int c;
				432	int bits;
				433
				434	inend = in + (*inlen);
				435	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				436	c= *in++;
				437
				438	/* assertion: c is a single UTF-4 value */
				439	if (out >= outend)
				440	break;
				441	if (c < 0x80) { *out++= c; bits= -6; }
				442	else {
				443	*outlen = out - outstart;
				444	*inlen = processed - base;
				445	return(-1);
				446	}
				447
				448	for ( ; bits >= 0; bits-= 6) {
				449	if (out >= outend)
				450	break;
				451	*out++= ((c >> bits) & 0x3F) \| 0x80;
				452	}
				453	processed = (const unsigned char*) in;
				454	}
				455	*outlen = out - outstart;
				456	*inlen = processed - base;
				457	return(0);
				458	}
				459
				460	/**
				461	* UTF8Toascii:
				462	* @out: a pointer to an array of bytes to store the result
				463	* @outlen: the length of @out
				464	* @in: a pointer to an array of UTF-8 chars
				465	* @inlen: the length of @in
				466	*
				467	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				468	* block of chars out.
				469	*
				470	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				471	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	472	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	473	* The value of @outlen after return is the number of ocetes consumed.
				474	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	475	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	476	UTF8Toascii(unsigned char* out, int *outlen,
				477	const unsigned char* in, int *inlen) {
				478	const unsigned char* processed = in;
				479	const unsigned char* outend;
				480	const unsigned char* outstart = out;
				481	const unsigned char* instart = in;
				482	const unsigned char* inend;
				483	unsigned int c, d;
				484	int trailing;
				485
				486	if (in == NULL) {
				487	/*
				488	* initialization nothing to do
				489	*/
				490	*outlen = 0;
				491	*inlen = 0;
				492	return(0);
				493	}
				494	inend = in + (*inlen);
				495	outend = out + (*outlen);
				496	while (in < inend) {
				497	d = *in++;
				498	if (d < 0x80) { c= d; trailing= 0; }
				499	else if (d < 0xC0) {
				500	/* trailing byte in leading position */
				501	*outlen = out - outstart;
				502	*inlen = processed - instart;
				503	return(-2);
				504	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				505	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				506	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				507	else {
				508	/* no chance for this in Ascii */
				509	*outlen = out - outstart;
				510	*inlen = processed - instart;
				511	return(-2);
				512	}
				513
				514	if (inend - in < trailing) {
				515	break;
				516	}
				517
				518	for ( ; trailing; trailing--) {
				519	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				520	break;
				521	c <<= 6;
				522	c \|= d & 0x3F;
				523	}
				524
				525	/* assertion: c is a single UTF-4 value */
				526	if (c < 0x80) {
				527	if (out >= outend)
				528	break;
				529	*out++ = c;
				530	} else {
				531	/* no chance for this in Ascii */
				532	*outlen = out - outstart;
				533	*inlen = processed - instart;
				534	return(-2);
				535	}
				536	processed = in;
				537	}
				538	*outlen = out - outstart;
				539	*inlen = processed - instart;
				540	return(0);
				541	}
				542
				543	/**
				544	* isolat1ToUTF8:
				545	* @out: a pointer to an array of bytes to store the result
				546	* @outlen: the length of @out
				547	* @in: a pointer to an array of ISO Latin 1 chars
				548	* @inlen: the length of @in
				549	*
				550	* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
				551	* block of chars out.
				552	* Returns 0 if success, or -1 otherwise
				553	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	554	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	555	* The value of @outlen after return is the number of ocetes consumed.
				556	*/
				557	int
				558	isolat1ToUTF8(unsigned char* out, int *outlen,
				559	const unsigned char* in, int *inlen) {
				560	unsigned char* outstart = out;
				561	const unsigned char* base = in;
				562	const unsigned char* processed = in;
				563	unsigned char* outend = out + *outlen;
				564	const unsigned char* inend;
				565	unsigned int c;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	566
				567	inend = in + (*inlen);
Daniel Veillard	02141ea	2001-04-30 11:46:40 +0000	[diff] [blame]	568	while (in < inend) {
				569	c = *in++;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	570
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	571	if (out >= outend)
				572	break;
Daniel Veillard	02141ea	2001-04-30 11:46:40 +0000	[diff] [blame]	573
				574	if (c < 0x80) {
				575	*out++ = c;
				576	processed++;
				577	continue;
				578	} else {
				579	*out++= ((c >> 6) & 0x1F) \| 0xC0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	580	if (out >= outend)
Daniel Veillard	02141ea	2001-04-30 11:46:40 +0000	[diff] [blame]	581	break;
				582	*out++= (c & 0x3F) \| 0x80;
				583	processed++;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	584	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	585	}
				586	*outlen = out - outstart;
				587	*inlen = processed - base;
				588	return(0);
				589	}
				590
				591	/**
				592	* UTF8Toisolat1:
				593	* @out: a pointer to an array of bytes to store the result
				594	* @outlen: the length of @out
				595	* @in: a pointer to an array of UTF-8 chars
				596	* @inlen: the length of @in
				597	*
				598	* Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
				599	* block of chars out.
				600	*
				601	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				602	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	603	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	604	* The value of @outlen after return is the number of ocetes consumed.
				605	*/
				606	int
				607	UTF8Toisolat1(unsigned char* out, int *outlen,
				608	const unsigned char* in, int *inlen) {
				609	const unsigned char* processed = in;
				610	const unsigned char* outend;
				611	const unsigned char* outstart = out;
				612	const unsigned char* instart = in;
				613	const unsigned char* inend;
				614	unsigned int c, d;
				615	int trailing;
				616
				617	if (in == NULL) {
				618	/*
				619	* initialization nothing to do
				620	*/
				621	*outlen = 0;
				622	*inlen = 0;
				623	return(0);
				624	}
				625	inend = in + (*inlen);
				626	outend = out + (*outlen);
				627	while (in < inend) {
				628	d = *in++;
				629	if (d < 0x80) { c= d; trailing= 0; }
				630	else if (d < 0xC0) {
				631	/* trailing byte in leading position */
				632	*outlen = out - outstart;
				633	*inlen = processed - instart;
				634	return(-2);
				635	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				636	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				637	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				638	else {
				639	/* no chance for this in IsoLat1 */
				640	*outlen = out - outstart;
				641	*inlen = processed - instart;
				642	return(-2);
				643	}
				644
				645	if (inend - in < trailing) {
				646	break;
				647	}
				648
				649	for ( ; trailing; trailing--) {
				650	if (in >= inend)
				651	break;
				652	if (((d= *in++) & 0xC0) != 0x80) {
				653	*outlen = out - outstart;
				654	*inlen = processed - instart;
				655	return(-2);
				656	}
				657	c <<= 6;
				658	c \|= d & 0x3F;
				659	}
				660
				661	/* assertion: c is a single UTF-4 value */
				662	if (c <= 0xFF) {
				663	if (out >= outend)
				664	break;
				665	*out++ = c;
				666	} else {
				667	/* no chance for this in IsoLat1 */
				668	*outlen = out - outstart;
				669	*inlen = processed - instart;
				670	return(-2);
				671	}
				672	processed = in;
				673	}
				674	*outlen = out - outstart;
				675	*inlen = processed - instart;
				676	return(0);
				677	}
				678
				679	/**
				680	* UTF16LEToUTF8:
				681	* @out: a pointer to an array of bytes to store the result
				682	* @outlen: the length of @out
				683	* @inb: a pointer to an array of UTF-16LE passwd as a byte array
				684	* @inlenb: the length of @in in UTF-16LE chars
				685	*
				686	* Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	687	* block of chars out. This function assume the endian property
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	688	* is the same between the native type of this machine and the
				689	* inputed one.
				690	*
				691	* Returns the number of byte written, or -1 by lack of space, or -2
				692	* if the transcoding fails (for *in is not valid utf16 string)
				693	* The value of *inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	694	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	695	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	696	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	697	UTF16LEToUTF8(unsigned char* out, int *outlen,
				698	const unsigned char* inb, int *inlenb)
				699	{
				700	unsigned char* outstart = out;
				701	const unsigned char* processed = inb;
				702	unsigned char* outend = out + *outlen;
				703	unsigned short* in = (unsigned short*) inb;
				704	unsigned short* inend;
				705	unsigned int c, d, inlen;
				706	unsigned char *tmp;
				707	int bits;
				708
				709	if ((*inlenb % 2) == 1)
				710	(*inlenb)--;
				711	inlen = *inlenb / 2;
				712	inend = in + inlen;
				713	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				714	if (xmlLittleEndian) {
				715	c= *in++;
				716	} else {
				717	tmp = (unsigned char *) in;
				718	c = *tmp++;
				719	c = c \| (((unsigned int)*tmp) << 8);
				720	in++;
				721	}
				722	if ((c & 0xFC00) == 0xD800) { /* surrogates */
				723	if (in >= inend) { /* (in > inend) shouldn't happens */
				724	break;
				725	}
				726	if (xmlLittleEndian) {
				727	d = *in++;
				728	} else {
				729	tmp = (unsigned char *) in;
				730	d = *tmp++;
				731	d = d \| (((unsigned int)*tmp) << 8);
				732	in++;
				733	}
				734	if ((d & 0xFC00) == 0xDC00) {
				735	c &= 0x03FF;
				736	c <<= 10;
				737	c \|= d & 0x03FF;
				738	c += 0x10000;
				739	}
				740	else {
				741	*outlen = out - outstart;
				742	*inlenb = processed - inb;
				743	return(-2);
				744	}
				745	}
				746
				747	/* assertion: c is a single UTF-4 value */
				748	if (out >= outend)
				749	break;
				750	if (c < 0x80) { *out++= c; bits= -6; }
				751	else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				752	else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				753	else { *out++= ((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				754
				755	for ( ; bits >= 0; bits-= 6) {
				756	if (out >= outend)
				757	break;
				758	*out++= ((c >> bits) & 0x3F) \| 0x80;
				759	}
				760	processed = (const unsigned char*) in;
				761	}
				762	*outlen = out - outstart;
				763	*inlenb = processed - inb;
				764	return(0);
				765	}
				766
				767	/**
				768	* UTF8ToUTF16LE:
				769	* @outb: a pointer to an array of bytes to store the result
				770	* @outlen: the length of @outb
				771	* @in: a pointer to an array of UTF-8 chars
				772	* @inlen: the length of @in
				773	*
				774	* Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
				775	* block of chars out.
				776	*
				777	* Returns the number of byte written, or -1 by lack of space, or -2
				778	* if the transcoding failed.
				779	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	780	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	781	UTF8ToUTF16LE(unsigned char* outb, int *outlen,
				782	const unsigned char* in, int *inlen)
				783	{
				784	unsigned short* out = (unsigned short*) outb;
				785	const unsigned char* processed = in;
				786	unsigned short* outstart= out;
				787	unsigned short* outend;
				788	const unsigned char* inend= in+*inlen;
				789	unsigned int c, d;
				790	int trailing;
				791	unsigned char *tmp;
				792	unsigned short tmp1, tmp2;
				793
				794	if (in == NULL) {
				795	/*
				796	* initialization, add the Byte Order Mark
				797	*/
				798	if (*outlen >= 2) {
				799	outb[0] = 0xFF;
				800	outb[1] = 0xFE;
				801	*outlen = 2;
				802	*inlen = 0;
				803	#ifdef DEBUG_ENCODING
				804	xmlGenericError(xmlGenericErrorContext,
				805	"Added FFFE Byte Order Mark\n");
				806	#endif
				807	return(2);
				808	}
				809	*outlen = 0;
				810	*inlen = 0;
				811	return(0);
				812	}
				813	outend = out + (*outlen / 2);
				814	while (in < inend) {
				815	d= *in++;
				816	if (d < 0x80) { c= d; trailing= 0; }
				817	else if (d < 0xC0) {
				818	/* trailing byte in leading position */
				819	outlen = (out - outstart) 2;
				820	*inlen = processed - in;
				821	return(-2);
				822	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				823	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				824	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				825	else {
				826	/* no chance for this in UTF-16 */
				827	outlen = (out - outstart) 2;
				828	*inlen = processed - in;
				829	return(-2);
				830	}
				831
				832	if (inend - in < trailing) {
				833	break;
				834	}
				835
				836	for ( ; trailing; trailing--) {
				837	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				838	break;
				839	c <<= 6;
				840	c \|= d & 0x3F;
				841	}
				842
				843	/* assertion: c is a single UTF-4 value */
				844	if (c < 0x10000) {
				845	if (out >= outend)
				846	break;
				847	if (xmlLittleEndian) {
				848	*out++ = c;
				849	} else {
				850	tmp = (unsigned char *) out;
				851	*tmp = c ;
				852	*(tmp + 1) = c >> 8 ;
				853	out++;
				854	}
				855	}
				856	else if (c < 0x110000) {
				857	if (out+1 >= outend)
				858	break;
				859	c -= 0x10000;
				860	if (xmlLittleEndian) {
				861	*out++ = 0xD800 \| (c >> 10);
				862	*out++ = 0xDC00 \| (c & 0x03FF);
				863	} else {
				864	tmp1 = 0xD800 \| (c >> 10);
				865	tmp = (unsigned char *) out;
				866	*tmp = (unsigned char) tmp1;
				867	*(tmp + 1) = tmp1 >> 8;
				868	out++;
				869
				870	tmp2 = 0xDC00 \| (c & 0x03FF);
				871	tmp = (unsigned char *) out;
				872	*tmp = (unsigned char) tmp2;
				873	*(tmp + 1) = tmp2 >> 8;
				874	out++;
				875	}
				876	}
				877	else
				878	break;
				879	processed = in;
				880	}
				881	outlen = (out - outstart) 2;
				882	*inlen = processed - in;
				883	return(0);
				884	}
				885
				886	/**
				887	* UTF16BEToUTF8:
				888	* @out: a pointer to an array of bytes to store the result
				889	* @outlen: the length of @out
				890	* @inb: a pointer to an array of UTF-16 passwd as a byte array
				891	* @inlenb: the length of @in in UTF-16 chars
				892	*
				893	* Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	894	* block of chars out. This function assume the endian property
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	895	* is the same between the native type of this machine and the
				896	* inputed one.
				897	*
				898	* Returns the number of byte written, or -1 by lack of space, or -2
				899	* if the transcoding fails (for *in is not valid utf16 string)
				900	* The value of *inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	901	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	902	*/
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    /*
     * Contract as implemented here:
     *  - the input is read byte by byte (high byte first), so the result
     *    does not depend on the host byte order or on @inb alignment;
     *  - on return *outlen is the number of bytes stored in @out and
     *    *inlenb the number of input bytes fully converted;
     *  - returns 0, or -2 when a high surrogate is not followed by a low
     *    surrogate.
     */
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    const unsigned char* processed = inb;
    unsigned int c, d;
    int bits, needed;

    /* Only whole 16-bit units can be converted: ignore a dangling byte. */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (inend - in >= 2) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        next = in + 2;

        if ((c & 0xFC00) == 0xD800) {    /* high surrogate */
            /*
             * The pair is split across the end of the buffer: stop here
             * without consuming it so that it can be retried once the
             * low surrogate is available.
             */
            if (inend - next < 2)
                break;
            d = ((unsigned int) next[0] << 8) | (unsigned int) next[1];
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = 0x10000 + (((c & 0x03FF) << 10) | (d & 0x03FF));
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) needed = 1;
        else if (c <   0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else                  needed = 4;

        /* Never emit a truncated UTF-8 sequence. */
        if (outend - out < needed)
            break;

        if      (c <    0x80) { *out++=  c;                        bits= -6; }
        else if (c <   0x800) { *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  { *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;

        in = next;
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
				977
				978	/**
				979	* UTF8ToUTF16BE:
				980	* @outb: a pointer to an array of bytes to store the result
				981	* @outlen: the length of @outb
				982	* @in: a pointer to an array of UTF-8 chars
				983	* @inlen: the length of @in
				984	*
				985	* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
				986	* block of chars out.
				987	*
				988	* Returns the number of byte written, or -1 by lack of space, or -2
				989	* if the transcoding failed.
				990	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	991	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	992	UTF8ToUTF16BE(unsigned char* outb, int *outlen,
				993	const unsigned char* in, int *inlen)
				994	{
				995	unsigned short* out = (unsigned short*) outb;
				996	const unsigned char* processed = in;
				997	unsigned short* outstart= out;
				998	unsigned short* outend;
				999	const unsigned char* inend= in+*inlen;
				1000	unsigned int c, d;
				1001	int trailing;
				1002	unsigned char *tmp;
				1003	unsigned short tmp1, tmp2;
				1004
				1005	if (in == NULL) {
				1006	/*
				1007	* initialization, add the Byte Order Mark
				1008	*/
				1009	if (*outlen >= 2) {
				1010	outb[0] = 0xFE;
				1011	outb[1] = 0xFF;
				1012	*outlen = 2;
				1013	*inlen = 0;
				1014	#ifdef DEBUG_ENCODING
				1015	xmlGenericError(xmlGenericErrorContext,
				1016	"Added FEFF Byte Order Mark\n");
				1017	#endif
				1018	return(2);
				1019	}
				1020	*outlen = 0;
				1021	*inlen = 0;
				1022	return(0);
				1023	}
				1024	outend = out + (*outlen / 2);
				1025	while (in < inend) {
				1026	d= *in++;
				1027	if (d < 0x80) { c= d; trailing= 0; }
				1028	else if (d < 0xC0) {
				1029	/* trailing byte in leading position */
				1030	*outlen = out - outstart;
				1031	*inlen = processed - in;
				1032	return(-2);
				1033	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1034	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1035	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1036	else {
				1037	/* no chance for this in UTF-16 */
				1038	*outlen = out - outstart;
				1039	*inlen = processed - in;
				1040	return(-2);
				1041	}
				1042
				1043	if (inend - in < trailing) {
				1044	break;
				1045	}
				1046
				1047	for ( ; trailing; trailing--) {
				1048	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80)) break;
				1049	c <<= 6;
				1050	c \|= d & 0x3F;
				1051	}
				1052
				1053	/* assertion: c is a single UTF-4 value */
				1054	if (c < 0x10000) {
				1055	if (out >= outend) break;
				1056	if (xmlLittleEndian) {
				1057	tmp = (unsigned char *) out;
				1058	*tmp = c >> 8;
				1059	*(tmp + 1) = c;
				1060	out++;
				1061	} else {
				1062	*out++ = c;
				1063	}
				1064	}
				1065	else if (c < 0x110000) {
				1066	if (out+1 >= outend) break;
				1067	c -= 0x10000;
				1068	if (xmlLittleEndian) {
				1069	tmp1 = 0xD800 \| (c >> 10);
				1070	tmp = (unsigned char *) out;
				1071	*tmp = tmp1 >> 8;
				1072	*(tmp + 1) = (unsigned char) tmp1;
				1073	out++;
				1074
				1075	tmp2 = 0xDC00 \| (c & 0x03FF);
				1076	tmp = (unsigned char *) out;
				1077	*tmp = tmp2 >> 8;
				1078	*(tmp + 1) = (unsigned char) tmp2;
				1079	out++;
				1080	} else {
				1081	*out++ = 0xD800 \| (c >> 10);
				1082	*out++ = 0xDC00 \| (c & 0x03FF);
				1083	}
				1084	}
				1085	else
				1086	break;
				1087	processed = in;
				1088	}
				1089	outlen = (out - outstart) 2;
				1090	*inlen = processed - in;
				1091	return(0);
				1092	}
				1093
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1094	/************************************************************************
				1095	* *
				1096	* Generic encoding handling routines *
				1097	* *
				1098	************************************************************************/
				1099
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1100	/**
				1101	* xmlDetectCharEncoding:
				1102	* @in: a pointer to the first bytes of the XML entity, must be at least
				1103	* 4 bytes long.
				1104	* @len: pointer to the length of the buffer
				1105	*
				1106	* Guess the encoding of the entity using the first bytes of the entity content
				1107	* accordingly of the non-normative appendix F of the XML-1.0 recommendation.
				1108	*
				1109	* Returns one of the XML_CHAR_ENCODING_... values.
				1110	*/
				1111	xmlCharEncoding
				1112	xmlDetectCharEncoding(const unsigned char* in, int len)
				1113	{
				1114	if (len >= 4) {
				1115	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				1116	(in[2] == 0x00) && (in[3] == 0x3C))
				1117	return(XML_CHAR_ENCODING_UCS4BE);
				1118	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
				1119	(in[2] == 0x00) && (in[3] == 0x00))
				1120	return(XML_CHAR_ENCODING_UCS4LE);
				1121	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				1122	(in[2] == 0x3C) && (in[3] == 0x00))
				1123	return(XML_CHAR_ENCODING_UCS4_2143);
				1124	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
				1125	(in[2] == 0x00) && (in[3] == 0x00))
				1126	return(XML_CHAR_ENCODING_UCS4_3412);
				1127	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
				1128	(in[2] == 0xA7) && (in[3] == 0x94))
				1129	return(XML_CHAR_ENCODING_EBCDIC);
				1130	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
				1131	(in[2] == 0x78) && (in[3] == 0x6D))
				1132	return(XML_CHAR_ENCODING_UTF8);
				1133	}
Daniel Veillard	87a764e	2001-06-20 17:41:10 +0000	[diff] [blame]	1134	if (len >= 3) {
				1135	/*
				1136	* Errata on XML-1.0 June 20 2001
				1137	* We now allow an UTF8 encoded BOM
				1138	*/
				1139	if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
				1140	(in[2] == 0xBF))
				1141	return(XML_CHAR_ENCODING_UTF8);
				1142	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1143	if (len >= 2) {
				1144	if ((in[0] == 0xFE) && (in[1] == 0xFF))
				1145	return(XML_CHAR_ENCODING_UTF16BE);
				1146	if ((in[0] == 0xFF) && (in[1] == 0xFE))
				1147	return(XML_CHAR_ENCODING_UTF16LE);
				1148	}
				1149	return(XML_CHAR_ENCODING_NONE);
				1150	}
				1151
				1152	/**
				1153	* xmlCleanupEncodingAliases:
				1154	*
				1155	* Unregisters all aliases
				1156	*/
				1157	void
				1158	xmlCleanupEncodingAliases(void) {
				1159	int i;
				1160
				1161	if (xmlCharEncodingAliases == NULL)
				1162	return;
				1163
				1164	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1165	if (xmlCharEncodingAliases[i].name != NULL)
				1166	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1167	if (xmlCharEncodingAliases[i].alias != NULL)
				1168	xmlFree((char *) xmlCharEncodingAliases[i].alias);
				1169	}
				1170	xmlCharEncodingAliasesNb = 0;
				1171	xmlCharEncodingAliasesMax = 0;
				1172	xmlFree(xmlCharEncodingAliases);
Daniel Veillard	73c6e53	2002-01-08 13:15:33 +0000	[diff] [blame]	1173	xmlCharEncodingAliases = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1174	}
				1175
				1176	/**
				1177	* xmlGetEncodingAlias:
				1178	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1179	*
				1180	* Lookup an encoding name for the given alias.
				1181	*
				1182	* Returns NULL if not found the original name otherwise
				1183	*/
				1184	const char *
				1185	xmlGetEncodingAlias(const char *alias) {
				1186	int i;
				1187	char upper[100];
				1188
				1189	if (alias == NULL)
				1190	return(NULL);
				1191
				1192	if (xmlCharEncodingAliases == NULL)
				1193	return(NULL);
				1194
				1195	for (i = 0;i < 99;i++) {
				1196	upper[i] = toupper(alias[i]);
				1197	if (upper[i] == 0) break;
				1198	}
				1199	upper[i] = 0;
				1200
				1201	/*
				1202	* Walk down the list looking for a definition of the alias
				1203	*/
				1204	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1205	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
				1206	return(xmlCharEncodingAliases[i].name);
				1207	}
				1208	}
				1209	return(NULL);
				1210	}
				1211
				1212	/**
				1213	* xmlAddEncodingAlias:
				1214	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
				1215	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1216	*
				1217	* Registers and alias @alias for an encoding named @name. Existing alias
				1218	* will be overwritten.
				1219	*
				1220	* Returns 0 in case of success, -1 in case of error
				1221	*/
				1222	int
				1223	xmlAddEncodingAlias(const char name, const char alias) {
				1224	int i;
				1225	char upper[100];
				1226
				1227	if ((name == NULL) \|\| (alias == NULL))
				1228	return(-1);
				1229
				1230	for (i = 0;i < 99;i++) {
				1231	upper[i] = toupper(alias[i]);
				1232	if (upper[i] == 0) break;
				1233	}
				1234	upper[i] = 0;
				1235
				1236	if (xmlCharEncodingAliases == NULL) {
				1237	xmlCharEncodingAliasesNb = 0;
				1238	xmlCharEncodingAliasesMax = 20;
				1239	xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
				1240	xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
				1241	if (xmlCharEncodingAliases == NULL)
				1242	return(-1);
				1243	} else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
				1244	xmlCharEncodingAliasesMax *= 2;
				1245	xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
				1246	xmlRealloc(xmlCharEncodingAliases,
				1247	xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
				1248	}
				1249	/*
				1250	* Walk down the list looking for a definition of the alias
				1251	*/
				1252	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1253	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
				1254	/*
				1255	* Replace the definition.
				1256	*/
				1257	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1258	xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
				1259	return(0);
				1260	}
				1261	}
				1262	/*
				1263	* Add the definition
				1264	*/
				1265	xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
				1266	xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
				1267	xmlCharEncodingAliasesNb++;
				1268	return(0);
				1269	}
				1270
				1271	/**
				1272	* xmlDelEncodingAlias:
				1273	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1274	*
				1275	* Unregisters an encoding alias @alias
				1276	*
				1277	* Returns 0 in case of success, -1 in case of error
				1278	*/
				1279	int
				1280	xmlDelEncodingAlias(const char *alias) {
				1281	int i;
				1282
				1283	if (alias == NULL)
				1284	return(-1);
				1285
				1286	if (xmlCharEncodingAliases == NULL)
				1287	return(-1);
				1288	/*
				1289	* Walk down the list looking for a definition of the alias
				1290	*/
				1291	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1292	if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
				1293	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1294	xmlFree((char *) xmlCharEncodingAliases[i].alias);
				1295	xmlCharEncodingAliasesNb--;
				1296	memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
				1297	sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
				1298	return(0);
				1299	}
				1300	}
				1301	return(-1);
				1302	}
				1303
				1304	/**
				1305	* xmlParseCharEncoding:
				1306	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
				1307	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1308	* Compare the string to the known encoding schemes already known. Note
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1309	* that the comparison is case insensitive accordingly to the section
				1310	* [XML] 4.3.3 Character Encoding in Entities.
				1311	*
				1312	* Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
				1313	* if not recognized.
				1314	*/
				1315	xmlCharEncoding
				1316	xmlParseCharEncoding(const char* name)
				1317	{
				1318	const char *alias;
				1319	char upper[500];
				1320	int i;
				1321
				1322	if (name == NULL)
				1323	return(XML_CHAR_ENCODING_NONE);
				1324
				1325	/*
				1326	* Do the alias resolution
				1327	*/
				1328	alias = xmlGetEncodingAlias(name);
				1329	if (alias != NULL)
				1330	name = alias;
				1331
				1332	for (i = 0;i < 499;i++) {
				1333	upper[i] = toupper(name[i]);
				1334	if (upper[i] == 0) break;
				1335	}
				1336	upper[i] = 0;
				1337
				1338	if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
				1339	if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
				1340	if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
				1341
				1342	/*
				1343	* NOTE: if we were able to parse this, the endianness of UTF16 is
				1344	* already found and in use
				1345	*/
				1346	if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
				1347	if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
				1348
				1349	if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				1350	if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				1351	if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
				1352
				1353	/*
				1354	* NOTE: if we were able to parse this, the endianness of UCS4 is
				1355	* already found and in use
				1356	*/
				1357	if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				1358	if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				1359	if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
				1360
				1361
				1362	if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
				1363	if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
				1364	if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
				1365
				1366	if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
				1367	if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
				1368	if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
				1369
				1370	if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
				1371	if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
				1372	if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
				1373	if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
				1374	if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
				1375	if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
				1376	if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
				1377
				1378	if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
				1379	if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
				1380	if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
				1381
				1382	#ifdef DEBUG_ENCODING
				1383	xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
				1384	#endif
				1385	return(XML_CHAR_ENCODING_ERROR);
				1386	}
				1387
				1388	/**
				1389	* xmlGetCharEncodingName:
				1390	* @enc: the encoding
				1391	*
				1392	* The "canonical" name for XML encoding.
				1393	* C.f. http://www.w3.org/TR/REC-xml#charencoding
				1394	* Section 4.3.3 Character Encoding in Entities
				1395	*
				1396	* Returns the canonical name for the given encoding
				1397	*/
				1398
				1399	const char*
				1400	xmlGetCharEncodingName(xmlCharEncoding enc) {
				1401	switch (enc) {
				1402	case XML_CHAR_ENCODING_ERROR:
				1403	return(NULL);
				1404	case XML_CHAR_ENCODING_NONE:
				1405	return(NULL);
				1406	case XML_CHAR_ENCODING_UTF8:
				1407	return("UTF-8");
				1408	case XML_CHAR_ENCODING_UTF16LE:
				1409	return("UTF-16");
				1410	case XML_CHAR_ENCODING_UTF16BE:
				1411	return("UTF-16");
				1412	case XML_CHAR_ENCODING_EBCDIC:
				1413	return("EBCDIC");
				1414	case XML_CHAR_ENCODING_UCS4LE:
				1415	return("ISO-10646-UCS-4");
				1416	case XML_CHAR_ENCODING_UCS4BE:
				1417	return("ISO-10646-UCS-4");
				1418	case XML_CHAR_ENCODING_UCS4_2143:
				1419	return("ISO-10646-UCS-4");
				1420	case XML_CHAR_ENCODING_UCS4_3412:
				1421	return("ISO-10646-UCS-4");
				1422	case XML_CHAR_ENCODING_UCS2:
				1423	return("ISO-10646-UCS-2");
				1424	case XML_CHAR_ENCODING_8859_1:
				1425	return("ISO-8859-1");
				1426	case XML_CHAR_ENCODING_8859_2:
				1427	return("ISO-8859-2");
				1428	case XML_CHAR_ENCODING_8859_3:
				1429	return("ISO-8859-3");
				1430	case XML_CHAR_ENCODING_8859_4:
				1431	return("ISO-8859-4");
				1432	case XML_CHAR_ENCODING_8859_5:
				1433	return("ISO-8859-5");
				1434	case XML_CHAR_ENCODING_8859_6:
				1435	return("ISO-8859-6");
				1436	case XML_CHAR_ENCODING_8859_7:
				1437	return("ISO-8859-7");
				1438	case XML_CHAR_ENCODING_8859_8:
				1439	return("ISO-8859-8");
				1440	case XML_CHAR_ENCODING_8859_9:
				1441	return("ISO-8859-9");
				1442	case XML_CHAR_ENCODING_2022_JP:
				1443	return("ISO-2022-JP");
				1444	case XML_CHAR_ENCODING_SHIFT_JIS:
				1445	return("Shift-JIS");
				1446	case XML_CHAR_ENCODING_EUC_JP:
				1447	return("EUC-JP");
				1448	case XML_CHAR_ENCODING_ASCII:
				1449	return(NULL);
				1450	}
				1451	return(NULL);
				1452	}
				1453
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1454	/************************************************************************
				1455	* *
				1456	* Char encoding handlers *
				1457	* *
				1458	************************************************************************/
				1459
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1460
				1461	/* the size should be growable, but it's not a big deal ... */
				1462	#define MAX_ENCODING_HANDLERS 50
				1463	static xmlCharEncodingHandlerPtr *handlers = NULL;
				1464	static int nbCharEncodingHandler = 0;
				1465
				1466	/*
				1467	* The default is UTF-8 for XML, that's also the default used for the
				1468	* parser internals, so the default encoding handler is NULL
				1469	*/
				1470
				1471	static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
				1472
				1473	/**
				1474	* xmlNewCharEncodingHandler:
				1475	* @name: the encoding name, in UTF-8 format (ASCII actually)
				1476	* @input: the xmlCharEncodingInputFunc to read that encoding
				1477	* @output: the xmlCharEncodingOutputFunc to write that encoding
				1478	*
				1479	* Create and registers an xmlCharEncodingHandler.
				1480	* Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
				1481	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1482	static xmlCharEncodingHandlerPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1483	xmlNewCharEncodingHandler(const char *name,
				1484	xmlCharEncodingInputFunc input,
				1485	xmlCharEncodingOutputFunc output) {
				1486	xmlCharEncodingHandlerPtr handler;
				1487	const char *alias;
				1488	char upper[500];
				1489	int i;
				1490	char *up = 0;
				1491
				1492	/*
				1493	* Do the alias resolution
				1494	*/
				1495	alias = xmlGetEncodingAlias(name);
				1496	if (alias != NULL)
				1497	name = alias;
				1498
				1499	/*
				1500	* Keep only the uppercase version of the encoding.
				1501	*/
				1502	if (name == NULL) {
				1503	xmlGenericError(xmlGenericErrorContext,
				1504	"xmlNewCharEncodingHandler : no name !\n");
				1505	return(NULL);
				1506	}
				1507	for (i = 0;i < 499;i++) {
				1508	upper[i] = toupper(name[i]);
				1509	if (upper[i] == 0) break;
				1510	}
				1511	upper[i] = 0;
				1512	up = xmlMemStrdup(upper);
				1513	if (up == NULL) {
				1514	xmlGenericError(xmlGenericErrorContext,
				1515	"xmlNewCharEncodingHandler : out of memory !\n");
				1516	return(NULL);
				1517	}
				1518
				1519	/*
				1520	* allocate and fill-up an handler block.
				1521	*/
				1522	handler = (xmlCharEncodingHandlerPtr)
				1523	xmlMalloc(sizeof(xmlCharEncodingHandler));
				1524	if (handler == NULL) {
				1525	xmlGenericError(xmlGenericErrorContext,
				1526	"xmlNewCharEncodingHandler : out of memory !\n");
				1527	return(NULL);
				1528	}
				1529	handler->input = input;
				1530	handler->output = output;
				1531	handler->name = up;
				1532
				1533	#ifdef LIBXML_ICONV_ENABLED
				1534	handler->iconv_in = NULL;
				1535	handler->iconv_out = NULL;
				1536	#endif /* LIBXML_ICONV_ENABLED */
				1537
				1538	/*
				1539	* registers and returns the handler.
				1540	*/
				1541	xmlRegisterCharEncodingHandler(handler);
				1542	#ifdef DEBUG_ENCODING
				1543	xmlGenericError(xmlGenericErrorContext,
				1544	"Registered encoding handler for %s\n", name);
				1545	#endif
				1546	return(handler);
				1547	}
				1548
				1549	/**
				1550	* xmlInitCharEncodingHandlers:
				1551	*
				1552	* Initialize the char encoding support, it registers the default
				1553	* encoding supported.
				1554	* NOTE: while public, this function usually doesn't need to be called
				1555	* in normal processing.
				1556	*/
				1557	void
				1558	xmlInitCharEncodingHandlers(void) {
				1559	unsigned short int tst = 0x1234;
				1560	unsigned char ptr = (unsigned char ) &tst;
				1561
				1562	if (handlers != NULL) return;
				1563
				1564	handlers = (xmlCharEncodingHandlerPtr *)
				1565	xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
				1566
				1567	if (*ptr == 0x12) xmlLittleEndian = 0;
				1568	else if (*ptr == 0x34) xmlLittleEndian = 1;
				1569	else xmlGenericError(xmlGenericErrorContext,
				1570	"Odd problem at endianness detection\n");
				1571
				1572	if (handlers == NULL) {
				1573	xmlGenericError(xmlGenericErrorContext,
				1574	"xmlInitCharEncodingHandlers : out of memory !\n");
				1575	return;
				1576	}
				1577	xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
				1578	xmlUTF16LEHandler =
				1579	xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
				1580	xmlUTF16BEHandler =
				1581	xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
				1582	xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
				1583	xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard	2004242	2001-05-31 18:22:04 +0000	[diff] [blame]	1584	xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1585	#ifdef LIBXML_HTML_ENABLED
				1586	xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
				1587	#endif
				1588	}
				1589
				1590	/**
				1591	* xmlCleanupCharEncodingHandlers:
				1592	*
				1593	* Cleanup the memory allocated for the char encoding support, it
				1594	* unregisters all the encoding handlers and the aliases.
				1595	*/
				1596	void
				1597	xmlCleanupCharEncodingHandlers(void) {
				1598	xmlCleanupEncodingAliases();
				1599
				1600	if (handlers == NULL) return;
				1601
				1602	for (;nbCharEncodingHandler > 0;) {
				1603	nbCharEncodingHandler--;
				1604	if (handlers[nbCharEncodingHandler] != NULL) {
				1605	if (handlers[nbCharEncodingHandler]->name != NULL)
				1606	xmlFree(handlers[nbCharEncodingHandler]->name);
				1607	xmlFree(handlers[nbCharEncodingHandler]);
				1608	}
				1609	}
				1610	xmlFree(handlers);
				1611	handlers = NULL;
				1612	nbCharEncodingHandler = 0;
				1613	xmlDefaultCharEncodingHandler = NULL;
				1614	}
				1615
				1616	/**
				1617	* xmlRegisterCharEncodingHandler:
				1618	* @handler: the xmlCharEncodingHandlerPtr handler block
				1619	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1620	* Register the char encoding handler, surprising, isn't it ?
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1621	*/
				1622	void
				1623	xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
				1624	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1625	if (handler == NULL) {
				1626	xmlGenericError(xmlGenericErrorContext,
				1627	"xmlRegisterCharEncodingHandler: NULL handler !\n");
				1628	return;
				1629	}
				1630
				1631	if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
				1632	xmlGenericError(xmlGenericErrorContext,
				1633	"xmlRegisterCharEncodingHandler: Too many handler registered\n");
				1634	xmlGenericError(xmlGenericErrorContext,
				1635	"\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
				1636	return;
				1637	}
				1638	handlers[nbCharEncodingHandler++] = handler;
				1639	}
				1640
				1641	/**
				1642	* xmlGetCharEncodingHandler:
				1643	* @enc: an xmlCharEncoding value.
				1644	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1645	* Search in the registered set the handler able to read/write that encoding.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1646	*
				1647	* Returns the handler or NULL if not found
				1648	*/
				1649	xmlCharEncodingHandlerPtr
				1650	xmlGetCharEncodingHandler(xmlCharEncoding enc) {
				1651	xmlCharEncodingHandlerPtr handler;
				1652
				1653	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1654	switch (enc) {
				1655	case XML_CHAR_ENCODING_ERROR:
				1656	return(NULL);
				1657	case XML_CHAR_ENCODING_NONE:
				1658	return(NULL);
				1659	case XML_CHAR_ENCODING_UTF8:
				1660	return(NULL);
				1661	case XML_CHAR_ENCODING_UTF16LE:
				1662	return(xmlUTF16LEHandler);
				1663	case XML_CHAR_ENCODING_UTF16BE:
				1664	return(xmlUTF16BEHandler);
				1665	case XML_CHAR_ENCODING_EBCDIC:
				1666	handler = xmlFindCharEncodingHandler("EBCDIC");
				1667	if (handler != NULL) return(handler);
				1668	handler = xmlFindCharEncodingHandler("ebcdic");
				1669	if (handler != NULL) return(handler);
				1670	break;
				1671	case XML_CHAR_ENCODING_UCS4BE:
				1672	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1673	if (handler != NULL) return(handler);
				1674	handler = xmlFindCharEncodingHandler("UCS-4");
				1675	if (handler != NULL) return(handler);
				1676	handler = xmlFindCharEncodingHandler("UCS4");
				1677	if (handler != NULL) return(handler);
				1678	break;
				1679	case XML_CHAR_ENCODING_UCS4LE:
				1680	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1681	if (handler != NULL) return(handler);
				1682	handler = xmlFindCharEncodingHandler("UCS-4");
				1683	if (handler != NULL) return(handler);
				1684	handler = xmlFindCharEncodingHandler("UCS4");
				1685	if (handler != NULL) return(handler);
				1686	break;
				1687	case XML_CHAR_ENCODING_UCS4_2143:
				1688	break;
				1689	case XML_CHAR_ENCODING_UCS4_3412:
				1690	break;
				1691	case XML_CHAR_ENCODING_UCS2:
				1692	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
				1693	if (handler != NULL) return(handler);
				1694	handler = xmlFindCharEncodingHandler("UCS-2");
				1695	if (handler != NULL) return(handler);
				1696	handler = xmlFindCharEncodingHandler("UCS2");
				1697	if (handler != NULL) return(handler);
				1698	break;
				1699
				1700	/*
				1701	* We used to keep ISO Latin encodings native in the
				1702	* generated data. This led to so many problems that
				1703	* this has been removed. One can still change this
				1704	* back by registering no-ops encoders for those
				1705	*/
				1706	case XML_CHAR_ENCODING_8859_1:
				1707	handler = xmlFindCharEncodingHandler("ISO-8859-1");
				1708	if (handler != NULL) return(handler);
				1709	break;
				1710	case XML_CHAR_ENCODING_8859_2:
				1711	handler = xmlFindCharEncodingHandler("ISO-8859-2");
				1712	if (handler != NULL) return(handler);
				1713	break;
				1714	case XML_CHAR_ENCODING_8859_3:
				1715	handler = xmlFindCharEncodingHandler("ISO-8859-3");
				1716	if (handler != NULL) return(handler);
				1717	break;
				1718	case XML_CHAR_ENCODING_8859_4:
				1719	handler = xmlFindCharEncodingHandler("ISO-8859-4");
				1720	if (handler != NULL) return(handler);
				1721	break;
				1722	case XML_CHAR_ENCODING_8859_5:
				1723	handler = xmlFindCharEncodingHandler("ISO-8859-5");
				1724	if (handler != NULL) return(handler);
				1725	break;
				1726	case XML_CHAR_ENCODING_8859_6:
				1727	handler = xmlFindCharEncodingHandler("ISO-8859-6");
				1728	if (handler != NULL) return(handler);
				1729	break;
				1730	case XML_CHAR_ENCODING_8859_7:
				1731	handler = xmlFindCharEncodingHandler("ISO-8859-7");
				1732	if (handler != NULL) return(handler);
				1733	break;
				1734	case XML_CHAR_ENCODING_8859_8:
				1735	handler = xmlFindCharEncodingHandler("ISO-8859-8");
				1736	if (handler != NULL) return(handler);
				1737	break;
				1738	case XML_CHAR_ENCODING_8859_9:
				1739	handler = xmlFindCharEncodingHandler("ISO-8859-9");
				1740	if (handler != NULL) return(handler);
				1741	break;
				1742
				1743
				1744	case XML_CHAR_ENCODING_2022_JP:
				1745	handler = xmlFindCharEncodingHandler("ISO-2022-JP");
				1746	if (handler != NULL) return(handler);
				1747	break;
				1748	case XML_CHAR_ENCODING_SHIFT_JIS:
				1749	handler = xmlFindCharEncodingHandler("SHIFT-JIS");
				1750	if (handler != NULL) return(handler);
				1751	handler = xmlFindCharEncodingHandler("SHIFT_JIS");
				1752	if (handler != NULL) return(handler);
				1753	handler = xmlFindCharEncodingHandler("Shift_JIS");
				1754	if (handler != NULL) return(handler);
				1755	break;
				1756	case XML_CHAR_ENCODING_EUC_JP:
				1757	handler = xmlFindCharEncodingHandler("EUC-JP");
				1758	if (handler != NULL) return(handler);
				1759	break;
				1760	default:
				1761	break;
				1762	}
				1763
				1764	#ifdef DEBUG_ENCODING
				1765	xmlGenericError(xmlGenericErrorContext,
				1766	"No handler found for encoding %d\n", enc);
				1767	#endif
				1768	return(NULL);
				1769	}
				1770
				1771	/**
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1772	* xmlFindCharEncodingHandler:
				1773	* @name: a string describing the char encoding.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1774	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1775	* Search in the registered set the handler able to read/write that encoding.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1776	*
				1777	* Returns the handler or NULL if not found
				1778	*/
				1779	xmlCharEncodingHandlerPtr
				1780	xmlFindCharEncodingHandler(const char *name) {
				1781	const char *nalias;
				1782	const char *norig;
				1783	xmlCharEncoding alias;
				1784	#ifdef LIBXML_ICONV_ENABLED
				1785	xmlCharEncodingHandlerPtr enc;
				1786	iconv_t icv_in, icv_out;
				1787	#endif /* LIBXML_ICONV_ENABLED */
				1788	char upper[100];
				1789	int i;
				1790
				1791	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1792	if (name == NULL) return(xmlDefaultCharEncodingHandler);
				1793	if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
				1794
				1795	/*
				1796	* Do the alias resolution
				1797	*/
				1798	norig = name;
				1799	nalias = xmlGetEncodingAlias(name);
				1800	if (nalias != NULL)
				1801	name = nalias;
				1802
				1803	/*
				1804	* Check first for directly registered encoding names
				1805	*/
				1806	for (i = 0;i < 99;i++) {
				1807	upper[i] = toupper(name[i]);
				1808	if (upper[i] == 0) break;
				1809	}
				1810	upper[i] = 0;
				1811
				1812	for (i = 0;i < nbCharEncodingHandler; i++)
				1813	if (!strcmp(upper, handlers[i]->name)) {
				1814	#ifdef DEBUG_ENCODING
				1815	xmlGenericError(xmlGenericErrorContext,
				1816	"Found registered handler for encoding %s\n", name);
				1817	#endif
				1818	return(handlers[i]);
				1819	}
				1820
				1821	#ifdef LIBXML_ICONV_ENABLED
				1822	/* check whether iconv can handle this */
				1823	icv_in = iconv_open("UTF-8", name);
				1824	icv_out = iconv_open(name, "UTF-8");
				1825	if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
				1826	enc = (xmlCharEncodingHandlerPtr)
				1827	xmlMalloc(sizeof(xmlCharEncodingHandler));
				1828	if (enc == NULL) {
				1829	iconv_close(icv_in);
				1830	iconv_close(icv_out);
				1831	return(NULL);
				1832	}
				1833	enc->name = xmlMemStrdup(name);
				1834	enc->input = NULL;
				1835	enc->output = NULL;
				1836	enc->iconv_in = icv_in;
				1837	enc->iconv_out = icv_out;
				1838	#ifdef DEBUG_ENCODING
				1839	xmlGenericError(xmlGenericErrorContext,
				1840	"Found iconv handler for encoding %s\n", name);
				1841	#endif
				1842	return enc;
				1843	} else if ((icv_in != (iconv_t) -1) \|\| icv_out != (iconv_t) -1) {
				1844	xmlGenericError(xmlGenericErrorContext,
				1845	"iconv : problems with filters for '%s'\n", name);
				1846	}
				1847	#endif /* LIBXML_ICONV_ENABLED */
				1848
				1849	#ifdef DEBUG_ENCODING
				1850	xmlGenericError(xmlGenericErrorContext,
				1851	"No handler found for encoding %s\n", name);
				1852	#endif
				1853
				1854	/*
				1855	* Fallback using the canonical names
				1856	*/
				1857	alias = xmlParseCharEncoding(norig);
				1858	if (alias != XML_CHAR_ENCODING_ERROR) {
				1859	const char* canon;
				1860	canon = xmlGetCharEncodingName(alias);
				1861	if ((canon != NULL) && (strcmp(name, canon))) {
				1862	return(xmlFindCharEncodingHandler(canon));
				1863	}
				1864	}
				1865
				1866	return(NULL);
				1867	}
				1868
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1869	/************************************************************************
				1870	* *
				1871	* ICONV based generic conversion functions *
				1872	* *
				1873	************************************************************************/
				1874
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1875	#ifdef LIBXML_ICONV_ENABLED
				1876	/**
				1877	* xmlIconvWrapper:
				1878	* @cd: iconv converter data structure
				1879	* @out: a pointer to an array of bytes to store the result
				1880	* @outlen: the length of @out
				1881	* @in: a pointer to an array of ISO Latin 1 chars
				1882	* @inlen: the length of @in
				1883	*
				1884	* Returns 0 if success, or
				1885	* -1 by lack of space, or
				1886	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1887	* the result of transformation can't fit into the encoding we want), or
				1888	* -3 if there the last byte can't form a single output char.
				1889	*
				1890	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1891	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1892	* The value of @outlen after return is the number of ocetes consumed.
				1893	*/
				1894	static int
				1895	xmlIconvWrapper(iconv_t cd,
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1896	unsigned char out, int outlen,
				1897	const unsigned char in, int inlen) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1898
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1899	size_t icv_inlen = inlen, icv_outlen = outlen;
				1900	const char icv_in = (const char ) in;
				1901	char icv_out = (char ) out;
				1902	int ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1903
Darin Adler	699613b	2001-07-27 22:47:14 +0000	[diff] [blame]	1904	ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1905	if (in != NULL) {
				1906	*inlen -= icv_inlen;
				1907	*outlen -= icv_outlen;
				1908	} else {
				1909	*inlen = 0;
				1910	*outlen = 0;
				1911	}
				1912	if ((icv_inlen != 0) \|\| (ret == -1)) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1913	#ifdef EILSEQ
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1914	if (errno == EILSEQ) {
				1915	return -2;
				1916	} else
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1917	#endif
				1918	#ifdef E2BIG
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1919	if (errno == E2BIG) {
				1920	return -1;
				1921	} else
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1922	#endif
				1923	#ifdef EINVAL
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1924	if (errno == EINVAL) {
				1925	return -3;
				1926	} else
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1927	#endif
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1928	{
				1929	return -3;
				1930	}
				1931	}
				1932	return 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1933	}
				1934	#endif /* LIBXML_ICONV_ENABLED */
				1935
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1936	/************************************************************************
				1937	* *
				1938	* The real API used by libxml for on-the-fly conversion *
				1939	* *
				1940	************************************************************************/
				1941
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1942	/**
				1943	* xmlCharEncFirstLine:
				1944	* @handler: char enconding transformation data structure
				1945	* @out: an xmlBuffer for the output.
				1946	* @in: an xmlBuffer for the input
				1947	*
				1948	* Front-end for the encoding handler input function, but handle only
				1949	* the very first line, i.e. limit itself to 45 chars.
				1950	*
				1951	* Returns the number of byte written if success, or
				1952	* -1 general error
				1953	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1954	* the result of transformation can't fit into the encoding we want), or
				1955	*/
				1956	int
				1957	xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				1958	xmlBufferPtr in) {
				1959	int ret = -2;
				1960	int written;
				1961	int toconv;
				1962
				1963	if (handler == NULL) return(-1);
				1964	if (out == NULL) return(-1);
				1965	if (in == NULL) return(-1);
				1966
				1967	written = out->size - out->use;
				1968	toconv = in->use;
				1969	if (toconv * 2 >= written) {
				1970	xmlBufferGrow(out, toconv);
				1971	written = out->size - out->use - 1;
				1972	}
				1973
				1974	/*
				1975	* echo '<?xml version="1.0" encoding="UCS4"?>' \| wc -c => 38
				1976	* 45 chars should be sufficient to reach the end of the encoding
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1977	* declaration without going too far inside the document content.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1978	*/
				1979	written = 45;
				1980
				1981	if (handler->input != NULL) {
				1982	ret = handler->input(&out->content[out->use], &written,
				1983	in->content, &toconv);
				1984	xmlBufferShrink(in, toconv);
				1985	out->use += written;
				1986	out->content[out->use] = 0;
				1987	}
				1988	#ifdef LIBXML_ICONV_ENABLED
				1989	else if (handler->iconv_in != NULL) {
				1990	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				1991	&written, in->content, &toconv);
				1992	xmlBufferShrink(in, toconv);
				1993	out->use += written;
				1994	out->content[out->use] = 0;
				1995	if (ret == -1) ret = -3;
				1996	}
				1997	#endif /* LIBXML_ICONV_ENABLED */
				1998	#ifdef DEBUG_ENCODING
				1999	switch (ret) {
				2000	case 0:
				2001	xmlGenericError(xmlGenericErrorContext,
				2002	"converted %d bytes to %d bytes of input\n",
				2003	toconv, written);
				2004	break;
				2005	case -1:
				2006	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				2007	toconv, written, in->use);
				2008	break;
				2009	case -2:
				2010	xmlGenericError(xmlGenericErrorContext,
				2011	"input conversion failed due to input error\n");
				2012	break;
				2013	case -3:
				2014	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				2015	toconv, written, in->use);
				2016	break;
				2017	default:
				2018	xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
				2019	}
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2020	#endif /* DEBUG_ENCODING */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2021	/*
				2022	* Ignore when input buffer is not on a boundary
				2023	*/
				2024	if (ret == -3) ret = 0;
				2025	if (ret == -1) ret = 0;
				2026	return(ret);
				2027	}
				2028
				2029	/**
				2030	* xmlCharEncInFunc:
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2031	* @handler: char encoding transformation data structure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2032	* @out: an xmlBuffer for the output.
				2033	* @in: an xmlBuffer for the input
				2034	*
				2035	* Generic front-end for the encoding handler input function
				2036	*
				2037	* Returns the number of byte written if success, or
				2038	* -1 general error
				2039	* -2 if the transcoding fails (for *in is not valid utf8 string or
				2040	* the result of transformation can't fit into the encoding we want), or
				2041	*/
				2042	int
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2043	xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
				2044	xmlBufferPtr in)
				2045	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2046	int ret = -2;
				2047	int written;
				2048	int toconv;
				2049
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2050	if (handler == NULL)
				2051	return (-1);
				2052	if (out == NULL)
				2053	return (-1);
				2054	if (in == NULL)
				2055	return (-1);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2056
				2057	toconv = in->use;
				2058	if (toconv == 0)
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2059	return (0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2060	written = out->size - out->use;
				2061	if (toconv * 2 >= written) {
				2062	xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2063	written = out->size - out->use - 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2064	}
				2065	if (handler->input != NULL) {
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2066	ret = handler->input(&out->content[out->use], &written,
				2067	in->content, &toconv);
				2068	xmlBufferShrink(in, toconv);
				2069	out->use += written;
				2070	out->content[out->use] = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2071	}
				2072	#ifdef LIBXML_ICONV_ENABLED
				2073	else if (handler->iconv_in != NULL) {
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2074	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				2075	&written, in->content, &toconv);
				2076	xmlBufferShrink(in, toconv);
				2077	out->use += written;
				2078	out->content[out->use] = 0;
				2079	if (ret == -1)
				2080	ret = -3;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2081	}
				2082	#endif /* LIBXML_ICONV_ENABLED */
				2083	switch (ret) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2084	case 0:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2085	#ifdef DEBUG_ENCODING
				2086	xmlGenericError(xmlGenericErrorContext,
				2087	"converted %d bytes to %d bytes of input\n",
				2088	toconv, written);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2089	#endif
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2090	break;
				2091	case -1:
				2092	#ifdef DEBUG_ENCODING
				2093	xmlGenericError(xmlGenericErrorContext,
				2094	"converted %d bytes to %d bytes of input, %d left\n",
				2095	toconv, written, in->use);
				2096	#endif
				2097	break;
				2098	case -3:
				2099	#ifdef DEBUG_ENCODING
				2100	xmlGenericError(xmlGenericErrorContext,
				2101	"converted %d bytes to %d bytes of input, %d left\n",
				2102	toconv, written, in->use);
				2103	#endif
				2104	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2105	case -2:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2106	xmlGenericError(xmlGenericErrorContext,
				2107	"input conversion failed due to input error\n");
				2108	xmlGenericError(xmlGenericErrorContext,
				2109	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2110	in->content[0], in->content[1],
				2111	in->content[2], in->content[3]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2112	}
				2113	/*
				2114	* Ignore when input buffer is not on a boundary
				2115	*/
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2116	if (ret == -3)
				2117	ret = 0;
				2118	return (ret);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2119	}
				2120
				2121	/**
				2122	* xmlCharEncOutFunc:
				2123	* @handler: char enconding transformation data structure
				2124	* @out: an xmlBuffer for the output.
				2125	* @in: an xmlBuffer for the input
				2126	*
				2127	* Generic front-end for the encoding handler output function
				2128	* a first call with @in == NULL has to be made firs to initiate the
				2129	* output in case of non-stateless encoding needing to initiate their
				2130	* state or the output (like the BOM in UTF16).
				2131	* In case of UTF8 sequence conversion errors for the given encoder,
				2132	* the content will be automatically remapped to a CharRef sequence.
				2133	*
				2134	* Returns the number of byte written if success, or
				2135	* -1 general error
				2136	* -2 if the transcoding fails (for *in is not valid utf8 string or
				2137	* the result of transformation can't fit into the encoding we want), or
				2138	*/
				2139	int
				2140	xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				2141	xmlBufferPtr in) {
				2142	int ret = -2;
				2143	int written;
				2144	int writtentot = 0;
				2145	int toconv;
				2146	int output = 0;
				2147
				2148	if (handler == NULL) return(-1);
				2149	if (out == NULL) return(-1);
				2150
				2151	retry:
				2152
				2153	written = out->size - out->use;
				2154
				2155	/*
				2156	* First specific handling of in = NULL, i.e. the initialization call
				2157	*/
				2158	if (in == NULL) {
				2159	toconv = 0;
				2160	if (handler->output != NULL) {
				2161	ret = handler->output(&out->content[out->use], &written,
				2162	NULL, &toconv);
				2163	out->use += written;
				2164	out->content[out->use] = 0;
				2165	}
				2166	#ifdef LIBXML_ICONV_ENABLED
				2167	else if (handler->iconv_out != NULL) {
				2168	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				2169	&written, NULL, &toconv);
				2170	out->use += written;
				2171	out->content[out->use] = 0;
				2172	}
				2173	#endif /* LIBXML_ICONV_ENABLED */
				2174	#ifdef DEBUG_ENCODING
				2175	xmlGenericError(xmlGenericErrorContext,
				2176	"initialized encoder\n");
				2177	#endif
				2178	return(0);
				2179	}
				2180
				2181	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2182	* Conversion itself.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2183	*/
				2184	toconv = in->use;
				2185	if (toconv == 0)
				2186	return(0);
				2187	if (toconv * 2 >= written) {
				2188	xmlBufferGrow(out, toconv * 2);
				2189	written = out->size - out->use - 1;
				2190	}
				2191	if (handler->output != NULL) {
				2192	ret = handler->output(&out->content[out->use], &written,
				2193	in->content, &toconv);
				2194	xmlBufferShrink(in, toconv);
				2195	out->use += written;
				2196	writtentot += written;
				2197	out->content[out->use] = 0;
				2198	}
				2199	#ifdef LIBXML_ICONV_ENABLED
				2200	else if (handler->iconv_out != NULL) {
				2201	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				2202	&written, in->content, &toconv);
				2203	xmlBufferShrink(in, toconv);
				2204	out->use += written;
				2205	writtentot += written;
				2206	out->content[out->use] = 0;
				2207	if (ret == -1) {
				2208	if (written > 0) {
				2209	/*
				2210	* Can be a limitation of iconv
				2211	*/
				2212	goto retry;
				2213	}
				2214	ret = -3;
				2215	}
				2216	}
				2217	#endif /* LIBXML_ICONV_ENABLED */
				2218	else {
				2219	xmlGenericError(xmlGenericErrorContext,
				2220	"xmlCharEncOutFunc: no output function !\n");
				2221	return(-1);
				2222	}
				2223
				2224	if (ret >= 0) output += ret;
				2225
				2226	/*
				2227	* Attempt to handle error cases
				2228	*/
				2229	switch (ret) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2230	case 0:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2231	#ifdef DEBUG_ENCODING
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2232	xmlGenericError(xmlGenericErrorContext,
				2233	"converted %d bytes to %d bytes of output\n",
				2234	toconv, written);
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2235	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2236	break;
				2237	case -1:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2238	#ifdef DEBUG_ENCODING
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2239	xmlGenericError(xmlGenericErrorContext,
				2240	"output conversion failed by lack of space\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2241	#endif
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2242	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2243	case -3:
				2244	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
				2245	toconv, written, in->use);
				2246	break;
				2247	case -2: {
				2248	int len = in->use;
				2249	const xmlChar utf = (const xmlChar ) in->content;
				2250	int cur;
				2251
				2252	cur = xmlGetUTF8Char(utf, &len);
				2253	if (cur > 0) {
				2254	xmlChar charref[20];
				2255
				2256	#ifdef DEBUG_ENCODING
				2257	xmlGenericError(xmlGenericErrorContext,
				2258	"handling output conversion error\n");
				2259	xmlGenericError(xmlGenericErrorContext,
				2260	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2261	in->content[0], in->content[1],
				2262	in->content[2], in->content[3]);
				2263	#endif
				2264	/*
				2265	* Removes the UTF8 sequence, and replace it by a charref
				2266	* and continue the transcoding phase, hoping the error
				2267	* did not mangle the encoder state.
				2268	*/
Daniel Veillard	1669828	2001-09-14 10:29:27 +0000	[diff] [blame]	2269	sprintf((char *) charref, "&#%d;", cur);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2270	xmlBufferShrink(in, len);
				2271	xmlBufferAddHead(in, charref, -1);
				2272
				2273	goto retry;
				2274	} else {
				2275	xmlGenericError(xmlGenericErrorContext,
				2276	"output conversion failed due to conv error\n");
				2277	xmlGenericError(xmlGenericErrorContext,
				2278	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2279	in->content[0], in->content[1],
				2280	in->content[2], in->content[3]);
				2281	in->content[0] = ' ';
				2282	}
				2283	break;
				2284	}
				2285	}
				2286	return(ret);
				2287	}
				2288
				2289	/**
				2290	* xmlCharEncCloseFunc:
				2291	* @handler: char enconding transformation data structure
				2292	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2293	* Generic front-end for encoding handler close function
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2294	*
				2295	* Returns 0 if success, or -1 in case of error
				2296	*/
				2297	int
				2298	xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
				2299	int ret = 0;
				2300	if (handler == NULL) return(-1);
				2301	if (handler->name == NULL) return(-1);
				2302	#ifdef LIBXML_ICONV_ENABLED
				2303	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2304	* Iconv handlers can be used only once, free the whole block.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2305	* and the associated icon resources.
				2306	*/
				2307	if ((handler->iconv_out != NULL) \|\| (handler->iconv_in != NULL)) {
				2308	if (handler->name != NULL)
				2309	xmlFree(handler->name);
				2310	handler->name = NULL;
				2311	if (handler->iconv_out != NULL) {
				2312	if (iconv_close(handler->iconv_out))
				2313	ret = -1;
				2314	handler->iconv_out = NULL;
				2315	}
				2316	if (handler->iconv_in != NULL) {
				2317	if (iconv_close(handler->iconv_in))
				2318	ret = -1;
				2319	handler->iconv_in = NULL;
				2320	}
				2321	xmlFree(handler);
				2322	}
				2323	#endif /* LIBXML_ICONV_ENABLED */
				2324	#ifdef DEBUG_ENCODING
				2325	if (ret)
				2326	xmlGenericError(xmlGenericErrorContext,
				2327	"failed to close the encoding handler\n");
				2328	else
				2329	xmlGenericError(xmlGenericErrorContext,
				2330	"closed the encoding handler\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2331	#endif
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2332
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2333	return(ret);
				2334	}
				2335