Blame - encoding.c - fp2-dev/platform/external/libxml2

blob: f03285de8cbc2fe1709300b41c80a6dc678918e0 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* encoding.c : implements the encoding conversion functions needed for XML
				3	*
				4	* Related specs:
				5	* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
				6	* rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
				7	* [ISO-10646] UTF-8 and UTF-16 in Annexes
				8	* [ISO-8859-1] ISO Latin-1 characters codes.
				9	* [UNICODE] The Unicode Consortium, "The Unicode Standard --
				10	* Worldwide Character Encoding -- Version 1.0", Addison-
				11	* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
				12	* described in Unicode Technical Report #4.
				13	* [US-ASCII] Coded Character Set--7-bit American Standard Code for
				14	* Information Interchange, ANSI X3.4-1986.
				15	*
				16	* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
				17	*
				18	* See Copyright for the status of this software.
				19	*
				20	* Daniel.Veillard@w3.org
				21	*/
				22
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	23	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	24
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	25	#include <string.h>
				26
				27	#ifdef HAVE_CTYPE_H
				28	#include <ctype.h>
				29	#endif
				30	#ifdef HAVE_STDLIB_H
				31	#include <stdlib.h>
				32	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	33	#ifdef LIBXML_ICONV_ENABLED
				34	#ifdef HAVE_ERRNO_H
				35	#include <errno.h>
				36	#endif
				37	#endif
				38	#include <libxml/encoding.h>
				39	#include <libxml/xmlmemory.h>
				40	#ifdef LIBXML_HTML_ENABLED
				41	#include <libxml/HTMLparser.h>
				42	#endif
				43	#include <libxml/xmlerror.h>
				44
				45	xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
				46	xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
				47
				48	typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
				49	typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
				50	struct _xmlCharEncodingAlias {
				51	const char *name;
				52	const char *alias;
				53	};
				54
				55	static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
				56	static int xmlCharEncodingAliasesNb = 0;
				57	static int xmlCharEncodingAliasesMax = 0;
				58
				59	#ifdef LIBXML_ICONV_ENABLED
				60	#if 0
				61	#define DEBUG_ENCODING /* Define this to get encoding traces */
				62	#endif
				63	#endif
				64
				65	static int xmlLittleEndian = 1;
				66
				67	/*
				68	* From rfc2044: encoding of the Unicode values on UTF-8:
				69	*
				70	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				71	* 0000 0000-0000 007F 0xxxxxxx
				72	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				73	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				74	*
				75	* I hope we won't use values > 0xFFFF anytime soon !
				76	*/
				77
				78	/**
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame]	79	* xmlUTF8Strlen:
				80	* @utf: a sequence of UTF-8 encoded bytes
				81	*
				82	* compute the lenght of an UTF8 string, it doesn't do a full UTF8
				83	* checking of the content of the string.
				84	*
				85	* Returns the number of characters in the string or -1 in case of error
				86	*/
				87	int
				88	xmlUTF8Strlen(const unsigned char *utf) {
				89	int ret = 0;
				90
				91	if (utf == NULL)
				92	return(-1);
				93
				94	while (*utf != 0) {
				95	if (utf[0] & 0x80) {
				96	if ((utf[1] & 0xc0) != 0x80)
				97	return(-1);
				98	if ((utf[0] & 0xe0) == 0xe0) {
				99	if ((utf[2] & 0xc0) != 0x80)
				100	return(-1);
				101	if ((utf[0] & 0xf0) == 0xf0) {
				102	if ((utf[0] & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				103	return(-1);
				104	utf += 4;
				105	} else {
				106	utf += 3;
				107	}
				108	} else {
				109	utf += 2;
				110	}
				111	} else {
				112	utf++;
				113	}
				114	ret++;
				115	}
				116	return(ret);
				117	}
				118
				119	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	120	* xmlGetUTF8Char:
				121	* @utf: a sequence of UTF-8 encoded bytes
				122	* @len: a pointer to @bytes len
				123	*
				124	* Read one UTF8 Char from @utf
				125	*
				126	* Returns the char value or -1 in case of error and update @len with the
				127	* number of bytes used
				128	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	129	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	130	xmlGetUTF8Char(const unsigned char utf, int len) {
				131	unsigned int c;
				132
				133	if (utf == NULL)
				134	goto error;
				135	if (len == NULL)
				136	goto error;
				137	if (*len < 1)
				138	goto error;
				139
				140	c = utf[0];
				141	if (c & 0x80) {
				142	if (*len < 2)
				143	goto error;
				144	if ((utf[1] & 0xc0) != 0x80)
				145	goto error;
				146	if ((c & 0xe0) == 0xe0) {
				147	if (*len < 3)
				148	goto error;
				149	if ((utf[2] & 0xc0) != 0x80)
				150	goto error;
				151	if ((c & 0xf0) == 0xf0) {
				152	if (*len < 4)
				153	goto error;
				154	if ((c & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				155	goto error;
				156	*len = 4;
				157	/* 4-byte code */
				158	c = (utf[0] & 0x7) << 18;
				159	c \|= (utf[1] & 0x3f) << 12;
				160	c \|= (utf[2] & 0x3f) << 6;
				161	c \|= utf[3] & 0x3f;
				162	} else {
				163	/* 3-byte code */
				164	*len = 3;
				165	c = (utf[0] & 0xf) << 12;
				166	c \|= (utf[1] & 0x3f) << 6;
				167	c \|= utf[2] & 0x3f;
				168	}
				169	} else {
				170	/* 2-byte code */
				171	*len = 2;
				172	c = (utf[0] & 0x1f) << 6;
				173	c \|= utf[1] & 0x3f;
				174	}
				175	} else {
				176	/* 1-byte code */
				177	*len = 1;
				178	}
				179	return(c);
				180
				181	error:
				182	*len = 0;
				183	return(-1);
				184	}
				185
				186	/**
				187	* xmlCheckUTF8: Check utf-8 string for legality.
				188	* @utf: Pointer to putative utf-8 encoded string.
				189	*
				190	* Checks @utf for being valid utf-8. @utf is assumed to be
				191	* null-terminated. This function is not super-strict, as it will
				192	* allow longer utf-8 sequences than necessary. Note that Java is
				193	* capable of producing these sequences if provoked. Also note, this
				194	* routine checks for the 4-byte maxiumum size, but does not check for
				195	* 0x10ffff maximum value.
				196	*
				197	* Return value: true if @utf is valid.
				198	**/
				199	int
				200	xmlCheckUTF8(const unsigned char *utf)
				201	{
				202	int ix;
				203	unsigned char c;
				204
				205	for (ix = 0; (c = utf[ix]);) {
				206	if (c & 0x80) {
				207	if ((utf[ix + 1] & 0xc0) != 0x80)
				208	return(0);
				209	if ((c & 0xe0) == 0xe0) {
				210	if ((utf[ix + 2] & 0xc0) != 0x80)
				211	return(0);
				212	if ((c & 0xf0) == 0xf0) {
				213	if ((c & 0xf8) != 0xf0 \|\| (utf[ix + 3] & 0xc0) != 0x80)
				214	return(0);
				215	ix += 4;
				216	/* 4-byte code */
				217	} else
				218	/* 3-byte code */
				219	ix += 3;
				220	} else
				221	/* 2-byte code */
				222	ix += 2;
				223	} else
				224	/* 1-byte code */
				225	ix++;
				226	}
				227	return(1);
				228	}
				229
				230	/**
				231	* asciiToUTF8:
				232	* @out: a pointer to an array of bytes to store the result
				233	* @outlen: the length of @out
				234	* @in: a pointer to an array of ASCII chars
				235	* @inlen: the length of @in
				236	*
				237	* Take a block of ASCII chars in and try to convert it to an UTF-8
				238	* block of chars out.
				239	* Returns 0 if success, or -1 otherwise
				240	* The value of @inlen after return is the number of octets consumed
				241	* as the return value is positive, else unpredictiable.
				242	* The value of @outlen after return is the number of ocetes consumed.
				243	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	244	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	245	asciiToUTF8(unsigned char* out, int *outlen,
				246	const unsigned char* in, int *inlen) {
				247	unsigned char* outstart = out;
				248	const unsigned char* base = in;
				249	const unsigned char* processed = in;
				250	unsigned char* outend = out + *outlen;
				251	const unsigned char* inend;
				252	unsigned int c;
				253	int bits;
				254
				255	inend = in + (*inlen);
				256	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				257	c= *in++;
				258
				259	/* assertion: c is a single UTF-4 value */
				260	if (out >= outend)
				261	break;
				262	if (c < 0x80) { *out++= c; bits= -6; }
				263	else {
				264	*outlen = out - outstart;
				265	*inlen = processed - base;
				266	return(-1);
				267	}
				268
				269	for ( ; bits >= 0; bits-= 6) {
				270	if (out >= outend)
				271	break;
				272	*out++= ((c >> bits) & 0x3F) \| 0x80;
				273	}
				274	processed = (const unsigned char*) in;
				275	}
				276	*outlen = out - outstart;
				277	*inlen = processed - base;
				278	return(0);
				279	}
				280
				281	/**
				282	* UTF8Toascii:
				283	* @out: a pointer to an array of bytes to store the result
				284	* @outlen: the length of @out
				285	* @in: a pointer to an array of UTF-8 chars
				286	* @inlen: the length of @in
				287	*
				288	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				289	* block of chars out.
				290	*
				291	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				292	* The value of @inlen after return is the number of octets consumed
				293	* as the return value is positive, else unpredictiable.
				294	* The value of @outlen after return is the number of ocetes consumed.
				295	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	296	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	297	UTF8Toascii(unsigned char* out, int *outlen,
				298	const unsigned char* in, int *inlen) {
				299	const unsigned char* processed = in;
				300	const unsigned char* outend;
				301	const unsigned char* outstart = out;
				302	const unsigned char* instart = in;
				303	const unsigned char* inend;
				304	unsigned int c, d;
				305	int trailing;
				306
				307	if (in == NULL) {
				308	/*
				309	* initialization nothing to do
				310	*/
				311	*outlen = 0;
				312	*inlen = 0;
				313	return(0);
				314	}
				315	inend = in + (*inlen);
				316	outend = out + (*outlen);
				317	while (in < inend) {
				318	d = *in++;
				319	if (d < 0x80) { c= d; trailing= 0; }
				320	else if (d < 0xC0) {
				321	/* trailing byte in leading position */
				322	*outlen = out - outstart;
				323	*inlen = processed - instart;
				324	return(-2);
				325	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				326	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				327	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				328	else {
				329	/* no chance for this in Ascii */
				330	*outlen = out - outstart;
				331	*inlen = processed - instart;
				332	return(-2);
				333	}
				334
				335	if (inend - in < trailing) {
				336	break;
				337	}
				338
				339	for ( ; trailing; trailing--) {
				340	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				341	break;
				342	c <<= 6;
				343	c \|= d & 0x3F;
				344	}
				345
				346	/* assertion: c is a single UTF-4 value */
				347	if (c < 0x80) {
				348	if (out >= outend)
				349	break;
				350	*out++ = c;
				351	} else {
				352	/* no chance for this in Ascii */
				353	*outlen = out - outstart;
				354	*inlen = processed - instart;
				355	return(-2);
				356	}
				357	processed = in;
				358	}
				359	*outlen = out - outstart;
				360	*inlen = processed - instart;
				361	return(0);
				362	}
				363
				364	/**
				365	* isolat1ToUTF8:
				366	* @out: a pointer to an array of bytes to store the result
				367	* @outlen: the length of @out
				368	* @in: a pointer to an array of ISO Latin 1 chars
				369	* @inlen: the length of @in
				370	*
				371	* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
				372	* block of chars out.
				373	* Returns 0 if success, or -1 otherwise
				374	* The value of @inlen after return is the number of octets consumed
				375	* as the return value is positive, else unpredictiable.
				376	* The value of @outlen after return is the number of ocetes consumed.
				377	*/
				378	int
				379	isolat1ToUTF8(unsigned char* out, int *outlen,
				380	const unsigned char* in, int *inlen) {
				381	unsigned char* outstart = out;
				382	const unsigned char* base = in;
				383	const unsigned char* processed = in;
				384	unsigned char* outend = out + *outlen;
				385	const unsigned char* inend;
				386	unsigned int c;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	387
				388	inend = in + (*inlen);
Daniel Veillard	02141ea	2001-04-30 11:46:40 +0000	[diff] [blame^]	389	while (in < inend) {
				390	c = *in++;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	391
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	392	if (out >= outend)
				393	break;
Daniel Veillard	02141ea	2001-04-30 11:46:40 +0000	[diff] [blame^]	394
				395	if (c < 0x80) {
				396	*out++ = c;
				397	processed++;
				398	continue;
				399	} else {
				400	*out++= ((c >> 6) & 0x1F) \| 0xC0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	401	if (out >= outend)
Daniel Veillard	02141ea	2001-04-30 11:46:40 +0000	[diff] [blame^]	402	break;
				403	*out++= (c & 0x3F) \| 0x80;
				404	processed++;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	405	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	406	}
				407	*outlen = out - outstart;
				408	*inlen = processed - base;
				409	return(0);
				410	}
				411
				412	/**
				413	* UTF8Toisolat1:
				414	* @out: a pointer to an array of bytes to store the result
				415	* @outlen: the length of @out
				416	* @in: a pointer to an array of UTF-8 chars
				417	* @inlen: the length of @in
				418	*
				419	* Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
				420	* block of chars out.
				421	*
				422	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				423	* The value of @inlen after return is the number of octets consumed
				424	* as the return value is positive, else unpredictiable.
				425	* The value of @outlen after return is the number of ocetes consumed.
				426	*/
				427	int
				428	UTF8Toisolat1(unsigned char* out, int *outlen,
				429	const unsigned char* in, int *inlen) {
				430	const unsigned char* processed = in;
				431	const unsigned char* outend;
				432	const unsigned char* outstart = out;
				433	const unsigned char* instart = in;
				434	const unsigned char* inend;
				435	unsigned int c, d;
				436	int trailing;
				437
				438	if (in == NULL) {
				439	/*
				440	* initialization nothing to do
				441	*/
				442	*outlen = 0;
				443	*inlen = 0;
				444	return(0);
				445	}
				446	inend = in + (*inlen);
				447	outend = out + (*outlen);
				448	while (in < inend) {
				449	d = *in++;
				450	if (d < 0x80) { c= d; trailing= 0; }
				451	else if (d < 0xC0) {
				452	/* trailing byte in leading position */
				453	*outlen = out - outstart;
				454	*inlen = processed - instart;
				455	return(-2);
				456	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				457	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				458	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				459	else {
				460	/* no chance for this in IsoLat1 */
				461	*outlen = out - outstart;
				462	*inlen = processed - instart;
				463	return(-2);
				464	}
				465
				466	if (inend - in < trailing) {
				467	break;
				468	}
				469
				470	for ( ; trailing; trailing--) {
				471	if (in >= inend)
				472	break;
				473	if (((d= *in++) & 0xC0) != 0x80) {
				474	*outlen = out - outstart;
				475	*inlen = processed - instart;
				476	return(-2);
				477	}
				478	c <<= 6;
				479	c \|= d & 0x3F;
				480	}
				481
				482	/* assertion: c is a single UTF-4 value */
				483	if (c <= 0xFF) {
				484	if (out >= outend)
				485	break;
				486	*out++ = c;
				487	} else {
				488	/* no chance for this in IsoLat1 */
				489	*outlen = out - outstart;
				490	*inlen = processed - instart;
				491	return(-2);
				492	}
				493	processed = in;
				494	}
				495	*outlen = out - outstart;
				496	*inlen = processed - instart;
				497	return(0);
				498	}
				499
				500	/**
				501	* UTF16LEToUTF8:
				502	* @out: a pointer to an array of bytes to store the result
				503	* @outlen: the length of @out
				504	* @inb: a pointer to an array of UTF-16LE passwd as a byte array
				505	* @inlenb: the length of @in in UTF-16LE chars
				506	*
				507	* Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
				508	* block of chars out. This function assume the endian properity
				509	* is the same between the native type of this machine and the
				510	* inputed one.
				511	*
				512	* Returns the number of byte written, or -1 by lack of space, or -2
				513	* if the transcoding fails (for *in is not valid utf16 string)
				514	* The value of *inlen after return is the number of octets consumed
				515	* as the return value is positive, else unpredictiable.
				516	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	517	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	518	UTF16LEToUTF8(unsigned char* out, int *outlen,
				519	const unsigned char* inb, int *inlenb)
				520	{
				521	unsigned char* outstart = out;
				522	const unsigned char* processed = inb;
				523	unsigned char* outend = out + *outlen;
				524	unsigned short* in = (unsigned short*) inb;
				525	unsigned short* inend;
				526	unsigned int c, d, inlen;
				527	unsigned char *tmp;
				528	int bits;
				529
				530	if ((*inlenb % 2) == 1)
				531	(*inlenb)--;
				532	inlen = *inlenb / 2;
				533	inend = in + inlen;
				534	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				535	if (xmlLittleEndian) {
				536	c= *in++;
				537	} else {
				538	tmp = (unsigned char *) in;
				539	c = *tmp++;
				540	c = c \| (((unsigned int)*tmp) << 8);
				541	in++;
				542	}
				543	if ((c & 0xFC00) == 0xD800) { /* surrogates */
				544	if (in >= inend) { /* (in > inend) shouldn't happens */
				545	break;
				546	}
				547	if (xmlLittleEndian) {
				548	d = *in++;
				549	} else {
				550	tmp = (unsigned char *) in;
				551	d = *tmp++;
				552	d = d \| (((unsigned int)*tmp) << 8);
				553	in++;
				554	}
				555	if ((d & 0xFC00) == 0xDC00) {
				556	c &= 0x03FF;
				557	c <<= 10;
				558	c \|= d & 0x03FF;
				559	c += 0x10000;
				560	}
				561	else {
				562	*outlen = out - outstart;
				563	*inlenb = processed - inb;
				564	return(-2);
				565	}
				566	}
				567
				568	/* assertion: c is a single UTF-4 value */
				569	if (out >= outend)
				570	break;
				571	if (c < 0x80) { *out++= c; bits= -6; }
				572	else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				573	else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				574	else { *out++= ((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				575
				576	for ( ; bits >= 0; bits-= 6) {
				577	if (out >= outend)
				578	break;
				579	*out++= ((c >> bits) & 0x3F) \| 0x80;
				580	}
				581	processed = (const unsigned char*) in;
				582	}
				583	*outlen = out - outstart;
				584	*inlenb = processed - inb;
				585	return(0);
				586	}
				587
				588	/**
				589	* UTF8ToUTF16LE:
				590	* @outb: a pointer to an array of bytes to store the result
				591	* @outlen: the length of @outb
				592	* @in: a pointer to an array of UTF-8 chars
				593	* @inlen: the length of @in
				594	*
				595	* Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
				596	* block of chars out.
				597	*
				598	* Returns the number of byte written, or -1 by lack of space, or -2
				599	* if the transcoding failed.
				600	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	601	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	602	UTF8ToUTF16LE(unsigned char* outb, int *outlen,
				603	const unsigned char* in, int *inlen)
				604	{
				605	unsigned short* out = (unsigned short*) outb;
				606	const unsigned char* processed = in;
				607	unsigned short* outstart= out;
				608	unsigned short* outend;
				609	const unsigned char* inend= in+*inlen;
				610	unsigned int c, d;
				611	int trailing;
				612	unsigned char *tmp;
				613	unsigned short tmp1, tmp2;
				614
				615	if (in == NULL) {
				616	/*
				617	* initialization, add the Byte Order Mark
				618	*/
				619	if (*outlen >= 2) {
				620	outb[0] = 0xFF;
				621	outb[1] = 0xFE;
				622	*outlen = 2;
				623	*inlen = 0;
				624	#ifdef DEBUG_ENCODING
				625	xmlGenericError(xmlGenericErrorContext,
				626	"Added FFFE Byte Order Mark\n");
				627	#endif
				628	return(2);
				629	}
				630	*outlen = 0;
				631	*inlen = 0;
				632	return(0);
				633	}
				634	outend = out + (*outlen / 2);
				635	while (in < inend) {
				636	d= *in++;
				637	if (d < 0x80) { c= d; trailing= 0; }
				638	else if (d < 0xC0) {
				639	/* trailing byte in leading position */
				640	outlen = (out - outstart) 2;
				641	*inlen = processed - in;
				642	return(-2);
				643	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				644	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				645	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				646	else {
				647	/* no chance for this in UTF-16 */
				648	outlen = (out - outstart) 2;
				649	*inlen = processed - in;
				650	return(-2);
				651	}
				652
				653	if (inend - in < trailing) {
				654	break;
				655	}
				656
				657	for ( ; trailing; trailing--) {
				658	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				659	break;
				660	c <<= 6;
				661	c \|= d & 0x3F;
				662	}
				663
				664	/* assertion: c is a single UTF-4 value */
				665	if (c < 0x10000) {
				666	if (out >= outend)
				667	break;
				668	if (xmlLittleEndian) {
				669	*out++ = c;
				670	} else {
				671	tmp = (unsigned char *) out;
				672	*tmp = c ;
				673	*(tmp + 1) = c >> 8 ;
				674	out++;
				675	}
				676	}
				677	else if (c < 0x110000) {
				678	if (out+1 >= outend)
				679	break;
				680	c -= 0x10000;
				681	if (xmlLittleEndian) {
				682	*out++ = 0xD800 \| (c >> 10);
				683	*out++ = 0xDC00 \| (c & 0x03FF);
				684	} else {
				685	tmp1 = 0xD800 \| (c >> 10);
				686	tmp = (unsigned char *) out;
				687	*tmp = (unsigned char) tmp1;
				688	*(tmp + 1) = tmp1 >> 8;
				689	out++;
				690
				691	tmp2 = 0xDC00 \| (c & 0x03FF);
				692	tmp = (unsigned char *) out;
				693	*tmp = (unsigned char) tmp2;
				694	*(tmp + 1) = tmp2 >> 8;
				695	out++;
				696	}
				697	}
				698	else
				699	break;
				700	processed = in;
				701	}
				702	outlen = (out - outstart) 2;
				703	*inlen = processed - in;
				704	return(0);
				705	}
				706
				707	/**
				708	* UTF16BEToUTF8:
				709	* @out: a pointer to an array of bytes to store the result
				710	* @outlen: the length of @out
				711	* @inb: a pointer to an array of UTF-16 passwd as a byte array
				712	* @inlenb: the length of @in in UTF-16 chars
				713	*
				714	* Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
				715	* block of chars out. This function assume the endian properity
				716	* is the same between the native type of this machine and the
				717	* inputed one.
				718	*
				719	* Returns the number of byte written, or -1 by lack of space, or -2
				720	* if the transcoding fails (for *in is not valid utf16 string)
				721	* The value of *inlen after return is the number of octets consumed
				722	* as the return value is positive, else unpredictiable.
				723	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	724	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	725	UTF16BEToUTF8(unsigned char* out, int *outlen,
				726	const unsigned char* inb, int *inlenb)
				727	{
				728	unsigned char* outstart = out;
				729	const unsigned char* processed = inb;
				730	unsigned char* outend = out + *outlen;
				731	unsigned short* in = (unsigned short*) inb;
				732	unsigned short* inend;
				733	unsigned int c, d, inlen;
				734	unsigned char *tmp;
				735	int bits;
				736
				737	if ((*inlenb % 2) == 1)
				738	(*inlenb)--;
				739	inlen = *inlenb / 2;
				740	inend= in + inlen;
				741	while (in < inend) {
				742	if (xmlLittleEndian) {
				743	tmp = (unsigned char *) in;
				744	c = *tmp++;
				745	c = c << 8;
				746	c = c \| (unsigned int) *tmp;
				747	in++;
				748	} else {
				749	c= *in++;
				750	}
				751	if ((c & 0xFC00) == 0xD800) { /* surrogates */
				752	if (in >= inend) { /* (in > inend) shouldn't happens */
				753	*outlen = out - outstart;
				754	*inlenb = processed - inb;
				755	return(-2);
				756	}
				757	if (xmlLittleEndian) {
				758	tmp = (unsigned char *) in;
				759	d = *tmp++;
				760	d = d << 8;
				761	d = d \| (unsigned int) *tmp;
				762	in++;
				763	} else {
				764	d= *in++;
				765	}
				766	if ((d & 0xFC00) == 0xDC00) {
				767	c &= 0x03FF;
				768	c <<= 10;
				769	c \|= d & 0x03FF;
				770	c += 0x10000;
				771	}
				772	else {
				773	*outlen = out - outstart;
				774	*inlenb = processed - inb;
				775	return(-2);
				776	}
				777	}
				778
				779	/* assertion: c is a single UTF-4 value */
				780	if (out >= outend)
				781	break;
				782	if (c < 0x80) { *out++= c; bits= -6; }
				783	else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				784	else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				785	else { *out++= ((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				786
				787	for ( ; bits >= 0; bits-= 6) {
				788	if (out >= outend)
				789	break;
				790	*out++= ((c >> bits) & 0x3F) \| 0x80;
				791	}
				792	processed = (const unsigned char*) in;
				793	}
				794	*outlen = out - outstart;
				795	*inlenb = processed - inb;
				796	return(0);
				797	}
				798
				799	/**
				800	* UTF8ToUTF16BE:
				801	* @outb: a pointer to an array of bytes to store the result
				802	* @outlen: the length of @outb
				803	* @in: a pointer to an array of UTF-8 chars
				804	* @inlen: the length of @in
				805	*
				806	* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
				807	* block of chars out.
				808	*
				809	* Returns the number of byte written, or -1 by lack of space, or -2
				810	* if the transcoding failed.
				811	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	812	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	813	UTF8ToUTF16BE(unsigned char* outb, int *outlen,
				814	const unsigned char* in, int *inlen)
				815	{
				816	unsigned short* out = (unsigned short*) outb;
				817	const unsigned char* processed = in;
				818	unsigned short* outstart= out;
				819	unsigned short* outend;
				820	const unsigned char* inend= in+*inlen;
				821	unsigned int c, d;
				822	int trailing;
				823	unsigned char *tmp;
				824	unsigned short tmp1, tmp2;
				825
				826	if (in == NULL) {
				827	/*
				828	* initialization, add the Byte Order Mark
				829	*/
				830	if (*outlen >= 2) {
				831	outb[0] = 0xFE;
				832	outb[1] = 0xFF;
				833	*outlen = 2;
				834	*inlen = 0;
				835	#ifdef DEBUG_ENCODING
				836	xmlGenericError(xmlGenericErrorContext,
				837	"Added FEFF Byte Order Mark\n");
				838	#endif
				839	return(2);
				840	}
				841	*outlen = 0;
				842	*inlen = 0;
				843	return(0);
				844	}
				845	outend = out + (*outlen / 2);
				846	while (in < inend) {
				847	d= *in++;
				848	if (d < 0x80) { c= d; trailing= 0; }
				849	else if (d < 0xC0) {
				850	/* trailing byte in leading position */
				851	*outlen = out - outstart;
				852	*inlen = processed - in;
				853	return(-2);
				854	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				855	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				856	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				857	else {
				858	/* no chance for this in UTF-16 */
				859	*outlen = out - outstart;
				860	*inlen = processed - in;
				861	return(-2);
				862	}
				863
				864	if (inend - in < trailing) {
				865	break;
				866	}
				867
				868	for ( ; trailing; trailing--) {
				869	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80)) break;
				870	c <<= 6;
				871	c \|= d & 0x3F;
				872	}
				873
				874	/* assertion: c is a single UTF-4 value */
				875	if (c < 0x10000) {
				876	if (out >= outend) break;
				877	if (xmlLittleEndian) {
				878	tmp = (unsigned char *) out;
				879	*tmp = c >> 8;
				880	*(tmp + 1) = c;
				881	out++;
				882	} else {
				883	*out++ = c;
				884	}
				885	}
				886	else if (c < 0x110000) {
				887	if (out+1 >= outend) break;
				888	c -= 0x10000;
				889	if (xmlLittleEndian) {
				890	tmp1 = 0xD800 \| (c >> 10);
				891	tmp = (unsigned char *) out;
				892	*tmp = tmp1 >> 8;
				893	*(tmp + 1) = (unsigned char) tmp1;
				894	out++;
				895
				896	tmp2 = 0xDC00 \| (c & 0x03FF);
				897	tmp = (unsigned char *) out;
				898	*tmp = tmp2 >> 8;
				899	*(tmp + 1) = (unsigned char) tmp2;
				900	out++;
				901	} else {
				902	*out++ = 0xD800 \| (c >> 10);
				903	*out++ = 0xDC00 \| (c & 0x03FF);
				904	}
				905	}
				906	else
				907	break;
				908	processed = in;
				909	}
				910	outlen = (out - outstart) 2;
				911	*inlen = processed - in;
				912	return(0);
				913	}
				914
				915	/**
				916	* xmlDetectCharEncoding:
				917	* @in: a pointer to the first bytes of the XML entity, must be at least
				918	* 4 bytes long.
				919	* @len: pointer to the length of the buffer
				920	*
				921	* Guess the encoding of the entity using the first bytes of the entity content
				922	* accordingly of the non-normative appendix F of the XML-1.0 recommendation.
				923	*
				924	* Returns one of the XML_CHAR_ENCODING_... values.
				925	*/
				926	xmlCharEncoding
				927	xmlDetectCharEncoding(const unsigned char* in, int len)
				928	{
				929	if (len >= 4) {
				930	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				931	(in[2] == 0x00) && (in[3] == 0x3C))
				932	return(XML_CHAR_ENCODING_UCS4BE);
				933	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
				934	(in[2] == 0x00) && (in[3] == 0x00))
				935	return(XML_CHAR_ENCODING_UCS4LE);
				936	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				937	(in[2] == 0x3C) && (in[3] == 0x00))
				938	return(XML_CHAR_ENCODING_UCS4_2143);
				939	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
				940	(in[2] == 0x00) && (in[3] == 0x00))
				941	return(XML_CHAR_ENCODING_UCS4_3412);
				942	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
				943	(in[2] == 0xA7) && (in[3] == 0x94))
				944	return(XML_CHAR_ENCODING_EBCDIC);
				945	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
				946	(in[2] == 0x78) && (in[3] == 0x6D))
				947	return(XML_CHAR_ENCODING_UTF8);
				948	}
				949	if (len >= 2) {
				950	if ((in[0] == 0xFE) && (in[1] == 0xFF))
				951	return(XML_CHAR_ENCODING_UTF16BE);
				952	if ((in[0] == 0xFF) && (in[1] == 0xFE))
				953	return(XML_CHAR_ENCODING_UTF16LE);
				954	}
				955	return(XML_CHAR_ENCODING_NONE);
				956	}
				957
				958	/**
				959	* xmlCleanupEncodingAliases:
				960	*
				961	* Unregisters all aliases
				962	*/
				963	void
				964	xmlCleanupEncodingAliases(void) {
				965	int i;
				966
				967	if (xmlCharEncodingAliases == NULL)
				968	return;
				969
				970	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				971	if (xmlCharEncodingAliases[i].name != NULL)
				972	xmlFree((char *) xmlCharEncodingAliases[i].name);
				973	if (xmlCharEncodingAliases[i].alias != NULL)
				974	xmlFree((char *) xmlCharEncodingAliases[i].alias);
				975	}
				976	xmlCharEncodingAliasesNb = 0;
				977	xmlCharEncodingAliasesMax = 0;
				978	xmlFree(xmlCharEncodingAliases);
				979	}
				980
				981	/**
				982	* xmlGetEncodingAlias:
				983	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				984	*
				985	* Lookup an encoding name for the given alias.
				986	*
				987	* Returns NULL if not found the original name otherwise
				988	*/
				989	const char *
				990	xmlGetEncodingAlias(const char *alias) {
				991	int i;
				992	char upper[100];
				993
				994	if (alias == NULL)
				995	return(NULL);
				996
				997	if (xmlCharEncodingAliases == NULL)
				998	return(NULL);
				999
				1000	for (i = 0;i < 99;i++) {
				1001	upper[i] = toupper(alias[i]);
				1002	if (upper[i] == 0) break;
				1003	}
				1004	upper[i] = 0;
				1005
				1006	/*
				1007	* Walk down the list looking for a definition of the alias
				1008	*/
				1009	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1010	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
				1011	return(xmlCharEncodingAliases[i].name);
				1012	}
				1013	}
				1014	return(NULL);
				1015	}
				1016
				1017	/**
				1018	* xmlAddEncodingAlias:
				1019	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
				1020	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1021	*
				1022	* Registers and alias @alias for an encoding named @name. Existing alias
				1023	* will be overwritten.
				1024	*
				1025	* Returns 0 in case of success, -1 in case of error
				1026	*/
				1027	int
				1028	xmlAddEncodingAlias(const char name, const char alias) {
				1029	int i;
				1030	char upper[100];
				1031
				1032	if ((name == NULL) \|\| (alias == NULL))
				1033	return(-1);
				1034
				1035	for (i = 0;i < 99;i++) {
				1036	upper[i] = toupper(alias[i]);
				1037	if (upper[i] == 0) break;
				1038	}
				1039	upper[i] = 0;
				1040
				1041	if (xmlCharEncodingAliases == NULL) {
				1042	xmlCharEncodingAliasesNb = 0;
				1043	xmlCharEncodingAliasesMax = 20;
				1044	xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
				1045	xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
				1046	if (xmlCharEncodingAliases == NULL)
				1047	return(-1);
				1048	} else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
				1049	xmlCharEncodingAliasesMax *= 2;
				1050	xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
				1051	xmlRealloc(xmlCharEncodingAliases,
				1052	xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
				1053	}
				1054	/*
				1055	* Walk down the list looking for a definition of the alias
				1056	*/
				1057	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1058	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
				1059	/*
				1060	* Replace the definition.
				1061	*/
				1062	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1063	xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
				1064	return(0);
				1065	}
				1066	}
				1067	/*
				1068	* Add the definition
				1069	*/
				1070	xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
				1071	xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
				1072	xmlCharEncodingAliasesNb++;
				1073	return(0);
				1074	}
				1075
				1076	/**
				1077	* xmlDelEncodingAlias:
				1078	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1079	*
				1080	* Unregisters an encoding alias @alias
				1081	*
				1082	* Returns 0 in case of success, -1 in case of error
				1083	*/
				1084	int
				1085	xmlDelEncodingAlias(const char *alias) {
				1086	int i;
				1087
				1088	if (alias == NULL)
				1089	return(-1);
				1090
				1091	if (xmlCharEncodingAliases == NULL)
				1092	return(-1);
				1093	/*
				1094	* Walk down the list looking for a definition of the alias
				1095	*/
				1096	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1097	if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
				1098	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1099	xmlFree((char *) xmlCharEncodingAliases[i].alias);
				1100	xmlCharEncodingAliasesNb--;
				1101	memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
				1102	sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
				1103	return(0);
				1104	}
				1105	}
				1106	return(-1);
				1107	}
				1108
				1109	/**
				1110	* xmlParseCharEncoding:
				1111	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
				1112	*
				1113	* Conpare the string to the known encoding schemes already known. Note
				1114	* that the comparison is case insensitive accordingly to the section
				1115	* [XML] 4.3.3 Character Encoding in Entities.
				1116	*
				1117	* Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
				1118	* if not recognized.
				1119	*/
				1120	xmlCharEncoding
				1121	xmlParseCharEncoding(const char* name)
				1122	{
				1123	const char *alias;
				1124	char upper[500];
				1125	int i;
				1126
				1127	if (name == NULL)
				1128	return(XML_CHAR_ENCODING_NONE);
				1129
				1130	/*
				1131	* Do the alias resolution
				1132	*/
				1133	alias = xmlGetEncodingAlias(name);
				1134	if (alias != NULL)
				1135	name = alias;
				1136
				1137	for (i = 0;i < 499;i++) {
				1138	upper[i] = toupper(name[i]);
				1139	if (upper[i] == 0) break;
				1140	}
				1141	upper[i] = 0;
				1142
				1143	if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
				1144	if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
				1145	if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
				1146
				1147	/*
				1148	* NOTE: if we were able to parse this, the endianness of UTF16 is
				1149	* already found and in use
				1150	*/
				1151	if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
				1152	if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
				1153
				1154	if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				1155	if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				1156	if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
				1157
				1158	/*
				1159	* NOTE: if we were able to parse this, the endianness of UCS4 is
				1160	* already found and in use
				1161	*/
				1162	if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				1163	if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				1164	if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
				1165
				1166
				1167	if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
				1168	if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
				1169	if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
				1170
				1171	if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
				1172	if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
				1173	if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
				1174
				1175	if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
				1176	if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
				1177	if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
				1178	if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
				1179	if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
				1180	if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
				1181	if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
				1182
				1183	if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
				1184	if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
				1185	if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
				1186
				1187	#ifdef DEBUG_ENCODING
				1188	xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
				1189	#endif
				1190	return(XML_CHAR_ENCODING_ERROR);
				1191	}
				1192
				1193	/**
				1194	* xmlGetCharEncodingName:
				1195	* @enc: the encoding
				1196	*
				1197	* The "canonical" name for XML encoding.
				1198	* C.f. http://www.w3.org/TR/REC-xml#charencoding
				1199	* Section 4.3.3 Character Encoding in Entities
				1200	*
				1201	* Returns the canonical name for the given encoding
				1202	*/
				1203
				1204	const char*
				1205	xmlGetCharEncodingName(xmlCharEncoding enc) {
				1206	switch (enc) {
				1207	case XML_CHAR_ENCODING_ERROR:
				1208	return(NULL);
				1209	case XML_CHAR_ENCODING_NONE:
				1210	return(NULL);
				1211	case XML_CHAR_ENCODING_UTF8:
				1212	return("UTF-8");
				1213	case XML_CHAR_ENCODING_UTF16LE:
				1214	return("UTF-16");
				1215	case XML_CHAR_ENCODING_UTF16BE:
				1216	return("UTF-16");
				1217	case XML_CHAR_ENCODING_EBCDIC:
				1218	return("EBCDIC");
				1219	case XML_CHAR_ENCODING_UCS4LE:
				1220	return("ISO-10646-UCS-4");
				1221	case XML_CHAR_ENCODING_UCS4BE:
				1222	return("ISO-10646-UCS-4");
				1223	case XML_CHAR_ENCODING_UCS4_2143:
				1224	return("ISO-10646-UCS-4");
				1225	case XML_CHAR_ENCODING_UCS4_3412:
				1226	return("ISO-10646-UCS-4");
				1227	case XML_CHAR_ENCODING_UCS2:
				1228	return("ISO-10646-UCS-2");
				1229	case XML_CHAR_ENCODING_8859_1:
				1230	return("ISO-8859-1");
				1231	case XML_CHAR_ENCODING_8859_2:
				1232	return("ISO-8859-2");
				1233	case XML_CHAR_ENCODING_8859_3:
				1234	return("ISO-8859-3");
				1235	case XML_CHAR_ENCODING_8859_4:
				1236	return("ISO-8859-4");
				1237	case XML_CHAR_ENCODING_8859_5:
				1238	return("ISO-8859-5");
				1239	case XML_CHAR_ENCODING_8859_6:
				1240	return("ISO-8859-6");
				1241	case XML_CHAR_ENCODING_8859_7:
				1242	return("ISO-8859-7");
				1243	case XML_CHAR_ENCODING_8859_8:
				1244	return("ISO-8859-8");
				1245	case XML_CHAR_ENCODING_8859_9:
				1246	return("ISO-8859-9");
				1247	case XML_CHAR_ENCODING_2022_JP:
				1248	return("ISO-2022-JP");
				1249	case XML_CHAR_ENCODING_SHIFT_JIS:
				1250	return("Shift-JIS");
				1251	case XML_CHAR_ENCODING_EUC_JP:
				1252	return("EUC-JP");
				1253	case XML_CHAR_ENCODING_ASCII:
				1254	return(NULL);
				1255	}
				1256	return(NULL);
				1257	}
				1258
				1259	/****************************************************************
				1260	* *
				1261	* Char encoding handlers *
				1262	* *
				1263	****************************************************************/
				1264
				1265	/* Fixed capacity of the handler table: it is not growable, and registration is refused with an error message once it is full. */
				1266	#define MAX_ENCODING_HANDLERS 50
				1267	static xmlCharEncodingHandlerPtr *handlers = NULL; /* table of registered handlers, allocated by xmlInitCharEncodingHandlers() */
				1268	static int nbCharEncodingHandler = 0; /* number of slots of handlers[] currently in use */
				1269
				1270	/*
				1271	* The default is UTF-8 for XML, that's also the default used for the
				1272	* parser internals, so the default encoding handler is NULL.
				1273	* xmlFindCharEncodingHandler() returns this value for a NULL or empty name.
				1274	*/
				1275	static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
				1276
				1277	/**
				1278	* xmlNewCharEncodingHandler:
				1279	* @name: the encoding name, in UTF-8 format (ASCII actually)
				1280	* @input: the xmlCharEncodingInputFunc to read that encoding
				1281	* @output: the xmlCharEncodingOutputFunc to write that encoding
				1282	*
				1283	* Create and registers an xmlCharEncodingHandler.
				1284	* Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
				1285	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1286	static xmlCharEncodingHandlerPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1287	xmlNewCharEncodingHandler(const char *name,
				1288	xmlCharEncodingInputFunc input,
				1289	xmlCharEncodingOutputFunc output) {
				1290	xmlCharEncodingHandlerPtr handler;
				1291	const char *alias;
				1292	char upper[500];
				1293	int i;
				1294	char *up = 0;
				1295
				1296	/*
				1297	* Do the alias resolution
				1298	*/
				1299	alias = xmlGetEncodingAlias(name);
				1300	if (alias != NULL)
				1301	name = alias;
				1302
				1303	/*
				1304	* Keep only the uppercase version of the encoding.
				1305	*/
				1306	if (name == NULL) {
				1307	xmlGenericError(xmlGenericErrorContext,
				1308	"xmlNewCharEncodingHandler : no name !\n");
				1309	return(NULL);
				1310	}
				1311	for (i = 0;i < 499;i++) {
				1312	upper[i] = toupper(name[i]);
				1313	if (upper[i] == 0) break;
				1314	}
				1315	upper[i] = 0;
				1316	up = xmlMemStrdup(upper);
				1317	if (up == NULL) {
				1318	xmlGenericError(xmlGenericErrorContext,
				1319	"xmlNewCharEncodingHandler : out of memory !\n");
				1320	return(NULL);
				1321	}
				1322
				1323	/*
				1324	* allocate and fill-up an handler block.
				1325	*/
				1326	handler = (xmlCharEncodingHandlerPtr)
				1327	xmlMalloc(sizeof(xmlCharEncodingHandler));
				1328	if (handler == NULL) {
				1329	xmlGenericError(xmlGenericErrorContext,
				1330	"xmlNewCharEncodingHandler : out of memory !\n");
				1331	return(NULL);
				1332	}
				1333	handler->input = input;
				1334	handler->output = output;
				1335	handler->name = up;
				1336
				1337	#ifdef LIBXML_ICONV_ENABLED
				1338	handler->iconv_in = NULL;
				1339	handler->iconv_out = NULL;
				1340	#endif /* LIBXML_ICONV_ENABLED */
				1341
				1342	/*
				1343	* registers and returns the handler.
				1344	*/
				1345	xmlRegisterCharEncodingHandler(handler);
				1346	#ifdef DEBUG_ENCODING
				1347	xmlGenericError(xmlGenericErrorContext,
				1348	"Registered encoding handler for %s\n", name);
				1349	#endif
				1350	return(handler);
				1351	}
				1352
				1353	/**
				1354	* xmlInitCharEncodingHandlers:
				1355	*
				1356	* Initialize the char encoding support, it registers the default
				1357	* encoding supported.
				1358	* NOTE: while public, this function usually doesn't need to be called
				1359	* in normal processing.
				1360	*/
				1361	void
				1362	xmlInitCharEncodingHandlers(void) {
				1363	unsigned short int tst = 0x1234;
				1364	unsigned char ptr = (unsigned char ) &tst;
				1365
				1366	if (handlers != NULL) return;
				1367
				1368	handlers = (xmlCharEncodingHandlerPtr *)
				1369	xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
				1370
				1371	if (*ptr == 0x12) xmlLittleEndian = 0;
				1372	else if (*ptr == 0x34) xmlLittleEndian = 1;
				1373	else xmlGenericError(xmlGenericErrorContext,
				1374	"Odd problem at endianness detection\n");
				1375
				1376	if (handlers == NULL) {
				1377	xmlGenericError(xmlGenericErrorContext,
				1378	"xmlInitCharEncodingHandlers : out of memory !\n");
				1379	return;
				1380	}
				1381	xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
				1382	xmlUTF16LEHandler =
				1383	xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
				1384	xmlUTF16BEHandler =
				1385	xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
				1386	xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
				1387	xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
				1388	#ifdef LIBXML_HTML_ENABLED
				1389	xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
				1390	#endif
				1391	}
				1392
				1393	/**
				1394	* xmlCleanupCharEncodingHandlers:
				1395	*
				1396	* Cleanup the memory allocated for the char encoding support, it
				1397	* unregisters all the encoding handlers and the aliases.
				1398	*/
				1399	void
				1400	xmlCleanupCharEncodingHandlers(void) {
				1401	xmlCleanupEncodingAliases();
				1402
				1403	if (handlers == NULL) return;
				1404
				1405	for (;nbCharEncodingHandler > 0;) {
				1406	nbCharEncodingHandler--;
				1407	if (handlers[nbCharEncodingHandler] != NULL) {
				1408	if (handlers[nbCharEncodingHandler]->name != NULL)
				1409	xmlFree(handlers[nbCharEncodingHandler]->name);
				1410	xmlFree(handlers[nbCharEncodingHandler]);
				1411	}
				1412	}
				1413	xmlFree(handlers);
				1414	handlers = NULL;
				1415	nbCharEncodingHandler = 0;
				1416	xmlDefaultCharEncodingHandler = NULL;
				1417	}
				1418
				1419	/**
				1420	* xmlRegisterCharEncodingHandler:
				1421	* @handler: the xmlCharEncodingHandlerPtr handler block
				1422	*
				1423	* Register the char encoding handler, surprizing, isn't it ?
				1424	*/
				1425	void
				1426	xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
				1427	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1428	if (handler == NULL) {
				1429	xmlGenericError(xmlGenericErrorContext,
				1430	"xmlRegisterCharEncodingHandler: NULL handler !\n");
				1431	return;
				1432	}
				1433
				1434	if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
				1435	xmlGenericError(xmlGenericErrorContext,
				1436	"xmlRegisterCharEncodingHandler: Too many handler registered\n");
				1437	xmlGenericError(xmlGenericErrorContext,
				1438	"\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
				1439	return;
				1440	}
				1441	handlers[nbCharEncodingHandler++] = handler;
				1442	}
				1443
				1444	/**
				1445	* xmlGetCharEncodingHandler:
				1446	* @enc: an xmlCharEncoding value.
				1447	*
				1448	* Search in the registrered set the handler able to read/write that encoding.
				1449	*
				1450	* Returns the handler or NULL if not found
				1451	*/
				1452	xmlCharEncodingHandlerPtr
				1453	xmlGetCharEncodingHandler(xmlCharEncoding enc) {
				1454	xmlCharEncodingHandlerPtr handler;
				1455
				1456	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1457	switch (enc) {
				1458	case XML_CHAR_ENCODING_ERROR:
				1459	return(NULL);
				1460	case XML_CHAR_ENCODING_NONE:
				1461	return(NULL);
				1462	case XML_CHAR_ENCODING_UTF8:
				1463	return(NULL);
				1464	case XML_CHAR_ENCODING_UTF16LE:
				1465	return(xmlUTF16LEHandler);
				1466	case XML_CHAR_ENCODING_UTF16BE:
				1467	return(xmlUTF16BEHandler);
				1468	case XML_CHAR_ENCODING_EBCDIC:
				1469	handler = xmlFindCharEncodingHandler("EBCDIC");
				1470	if (handler != NULL) return(handler);
				1471	handler = xmlFindCharEncodingHandler("ebcdic");
				1472	if (handler != NULL) return(handler);
				1473	break;
				1474	case XML_CHAR_ENCODING_UCS4BE:
				1475	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1476	if (handler != NULL) return(handler);
				1477	handler = xmlFindCharEncodingHandler("UCS-4");
				1478	if (handler != NULL) return(handler);
				1479	handler = xmlFindCharEncodingHandler("UCS4");
				1480	if (handler != NULL) return(handler);
				1481	break;
				1482	case XML_CHAR_ENCODING_UCS4LE:
				1483	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1484	if (handler != NULL) return(handler);
				1485	handler = xmlFindCharEncodingHandler("UCS-4");
				1486	if (handler != NULL) return(handler);
				1487	handler = xmlFindCharEncodingHandler("UCS4");
				1488	if (handler != NULL) return(handler);
				1489	break;
				1490	case XML_CHAR_ENCODING_UCS4_2143:
				1491	break;
				1492	case XML_CHAR_ENCODING_UCS4_3412:
				1493	break;
				1494	case XML_CHAR_ENCODING_UCS2:
				1495	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
				1496	if (handler != NULL) return(handler);
				1497	handler = xmlFindCharEncodingHandler("UCS-2");
				1498	if (handler != NULL) return(handler);
				1499	handler = xmlFindCharEncodingHandler("UCS2");
				1500	if (handler != NULL) return(handler);
				1501	break;
				1502
				1503	/*
				1504	* We used to keep ISO Latin encodings native in the
				1505	* generated data. This led to so many problems that
				1506	* this has been removed. One can still change this
				1507	* back by registering no-ops encoders for those
				1508	*/
				1509	case XML_CHAR_ENCODING_8859_1:
				1510	handler = xmlFindCharEncodingHandler("ISO-8859-1");
				1511	if (handler != NULL) return(handler);
				1512	break;
				1513	case XML_CHAR_ENCODING_8859_2:
				1514	handler = xmlFindCharEncodingHandler("ISO-8859-2");
				1515	if (handler != NULL) return(handler);
				1516	break;
				1517	case XML_CHAR_ENCODING_8859_3:
				1518	handler = xmlFindCharEncodingHandler("ISO-8859-3");
				1519	if (handler != NULL) return(handler);
				1520	break;
				1521	case XML_CHAR_ENCODING_8859_4:
				1522	handler = xmlFindCharEncodingHandler("ISO-8859-4");
				1523	if (handler != NULL) return(handler);
				1524	break;
				1525	case XML_CHAR_ENCODING_8859_5:
				1526	handler = xmlFindCharEncodingHandler("ISO-8859-5");
				1527	if (handler != NULL) return(handler);
				1528	break;
				1529	case XML_CHAR_ENCODING_8859_6:
				1530	handler = xmlFindCharEncodingHandler("ISO-8859-6");
				1531	if (handler != NULL) return(handler);
				1532	break;
				1533	case XML_CHAR_ENCODING_8859_7:
				1534	handler = xmlFindCharEncodingHandler("ISO-8859-7");
				1535	if (handler != NULL) return(handler);
				1536	break;
				1537	case XML_CHAR_ENCODING_8859_8:
				1538	handler = xmlFindCharEncodingHandler("ISO-8859-8");
				1539	if (handler != NULL) return(handler);
				1540	break;
				1541	case XML_CHAR_ENCODING_8859_9:
				1542	handler = xmlFindCharEncodingHandler("ISO-8859-9");
				1543	if (handler != NULL) return(handler);
				1544	break;
				1545
				1546
				1547	case XML_CHAR_ENCODING_2022_JP:
				1548	handler = xmlFindCharEncodingHandler("ISO-2022-JP");
				1549	if (handler != NULL) return(handler);
				1550	break;
				1551	case XML_CHAR_ENCODING_SHIFT_JIS:
				1552	handler = xmlFindCharEncodingHandler("SHIFT-JIS");
				1553	if (handler != NULL) return(handler);
				1554	handler = xmlFindCharEncodingHandler("SHIFT_JIS");
				1555	if (handler != NULL) return(handler);
				1556	handler = xmlFindCharEncodingHandler("Shift_JIS");
				1557	if (handler != NULL) return(handler);
				1558	break;
				1559	case XML_CHAR_ENCODING_EUC_JP:
				1560	handler = xmlFindCharEncodingHandler("EUC-JP");
				1561	if (handler != NULL) return(handler);
				1562	break;
				1563	default:
				1564	break;
				1565	}
				1566
				1567	#ifdef DEBUG_ENCODING
				1568	xmlGenericError(xmlGenericErrorContext,
				1569	"No handler found for encoding %d\n", enc);
				1570	#endif
				1571	return(NULL);
				1572	}
				1573
				1574	/**
				1575	* xmlGetCharEncodingHandler:
				1576	* @enc: a string describing the char encoding.
				1577	*
				1578	* Search in the registrered set the handler able to read/write that encoding.
				1579	*
				1580	* Returns the handler or NULL if not found
				1581	*/
				1582	xmlCharEncodingHandlerPtr
				1583	xmlFindCharEncodingHandler(const char *name) {
				1584	const char *nalias;
				1585	const char *norig;
				1586	xmlCharEncoding alias;
				1587	#ifdef LIBXML_ICONV_ENABLED
				1588	xmlCharEncodingHandlerPtr enc;
				1589	iconv_t icv_in, icv_out;
				1590	#endif /* LIBXML_ICONV_ENABLED */
				1591	char upper[100];
				1592	int i;
				1593
				1594	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1595	if (name == NULL) return(xmlDefaultCharEncodingHandler);
				1596	if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
				1597
				1598	/*
				1599	* Do the alias resolution
				1600	*/
				1601	norig = name;
				1602	nalias = xmlGetEncodingAlias(name);
				1603	if (nalias != NULL)
				1604	name = nalias;
				1605
				1606	/*
				1607	* Check first for directly registered encoding names
				1608	*/
				1609	for (i = 0;i < 99;i++) {
				1610	upper[i] = toupper(name[i]);
				1611	if (upper[i] == 0) break;
				1612	}
				1613	upper[i] = 0;
				1614
				1615	for (i = 0;i < nbCharEncodingHandler; i++)
				1616	if (!strcmp(upper, handlers[i]->name)) {
				1617	#ifdef DEBUG_ENCODING
				1618	xmlGenericError(xmlGenericErrorContext,
				1619	"Found registered handler for encoding %s\n", name);
				1620	#endif
				1621	return(handlers[i]);
				1622	}
				1623
				1624	#ifdef LIBXML_ICONV_ENABLED
				1625	/* check whether iconv can handle this */
				1626	icv_in = iconv_open("UTF-8", name);
				1627	icv_out = iconv_open(name, "UTF-8");
				1628	if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
				1629	enc = (xmlCharEncodingHandlerPtr)
				1630	xmlMalloc(sizeof(xmlCharEncodingHandler));
				1631	if (enc == NULL) {
				1632	iconv_close(icv_in);
				1633	iconv_close(icv_out);
				1634	return(NULL);
				1635	}
				1636	enc->name = xmlMemStrdup(name);
				1637	enc->input = NULL;
				1638	enc->output = NULL;
				1639	enc->iconv_in = icv_in;
				1640	enc->iconv_out = icv_out;
				1641	#ifdef DEBUG_ENCODING
				1642	xmlGenericError(xmlGenericErrorContext,
				1643	"Found iconv handler for encoding %s\n", name);
				1644	#endif
				1645	return enc;
				1646	} else if ((icv_in != (iconv_t) -1) \|\| icv_out != (iconv_t) -1) {
				1647	xmlGenericError(xmlGenericErrorContext,
				1648	"iconv : problems with filters for '%s'\n", name);
				1649	}
				1650	#endif /* LIBXML_ICONV_ENABLED */
				1651
				1652	#ifdef DEBUG_ENCODING
				1653	xmlGenericError(xmlGenericErrorContext,
				1654	"No handler found for encoding %s\n", name);
				1655	#endif
				1656
				1657	/*
				1658	* Fallback using the canonical names
				1659	*/
				1660	alias = xmlParseCharEncoding(norig);
				1661	if (alias != XML_CHAR_ENCODING_ERROR) {
				1662	const char* canon;
				1663	canon = xmlGetCharEncodingName(alias);
				1664	if ((canon != NULL) && (strcmp(name, canon))) {
				1665	return(xmlFindCharEncodingHandler(canon));
				1666	}
				1667	}
				1668
				1669	return(NULL);
				1670	}
				1671
				1672	#ifdef LIBXML_ICONV_ENABLED
				1673	/**
				1674	* xmlIconvWrapper:
				1675	* @cd: iconv converter data structure
				1676	* @out: a pointer to an array of bytes to store the result
				1677	* @outlen: the length of @out
				1678	* @in: a pointer to an array of ISO Latin 1 chars
				1679	* @inlen: the length of @in
				1680	*
				1681	* Returns 0 if success, or
				1682	* -1 by lack of space, or
				1683	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1684	* the result of transformation can't fit into the encoding we want), or
				1685	* -3 if there the last byte can't form a single output char.
				1686	*
				1687	* The value of @inlen after return is the number of octets consumed
				1688	* as the return value is positive, else unpredictiable.
				1689	* The value of @outlen after return is the number of ocetes consumed.
				1690	*/
				1691	static int
				1692	xmlIconvWrapper(iconv_t cd,
				1693	unsigned char out, int outlen,
				1694	const unsigned char in, int inlen) {
				1695
				1696	size_t icv_inlen = inlen, icv_outlen = outlen;
				1697	const char icv_in = (const char ) in;
				1698	char icv_out = (char ) out;
				1699	int ret;
				1700
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1701	ret = iconv(cd, &icv_in, &icv_inlen, &icv_out, &icv_outlen);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1702	if (in != NULL) {
				1703	*inlen -= icv_inlen;
				1704	*outlen -= icv_outlen;
				1705	} else {
				1706	*inlen = 0;
				1707	*outlen = 0;
				1708	}
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1709	if ((icv_inlen != 0) \|\| (ret == -1)) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1710	#ifdef EILSEQ
				1711	if (errno == EILSEQ) {
				1712	return -2;
				1713	} else
				1714	#endif
				1715	#ifdef E2BIG
				1716	if (errno == E2BIG) {
				1717	return -1;
				1718	} else
				1719	#endif
				1720	#ifdef EINVAL
				1721	if (errno == EINVAL) {
				1722	return -3;
				1723	} else
				1724	#endif
				1725	{
				1726	return -3;
				1727	}
				1728	}
				1729	return 0;
				1730	}
				1731	#endif /* LIBXML_ICONV_ENABLED */
				1732
				1733	/**
				1734	* xmlCharEncFirstLine:
				1735	* @handler: char enconding transformation data structure
				1736	* @out: an xmlBuffer for the output.
				1737	* @in: an xmlBuffer for the input
				1738	*
				1739	* Front-end for the encoding handler input function, but handle only
				1740	* the very first line, i.e. limit itself to 45 chars.
				1741	*
				1742	* Returns the number of byte written if success, or
				1743	* -1 general error
				1744	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1745	* the result of transformation can't fit into the encoding we want), or
				1746	*/
				1747	int
				1748	xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				1749	xmlBufferPtr in) {
				1750	int ret = -2;
				1751	int written;
				1752	int toconv;
				1753
				1754	if (handler == NULL) return(-1);
				1755	if (out == NULL) return(-1);
				1756	if (in == NULL) return(-1);
				1757
				1758	written = out->size - out->use;
				1759	toconv = in->use;
				1760	if (toconv * 2 >= written) {
				1761	xmlBufferGrow(out, toconv);
				1762	written = out->size - out->use - 1;
				1763	}
				1764
				1765	/*
				1766	* echo '<?xml version="1.0" encoding="UCS4"?>' \| wc -c => 38
				1767	* 45 chars should be sufficient to reach the end of the encoding
				1768	* decalration without going too far inside the document content.
				1769	*/
				1770	written = 45;
				1771
				1772	if (handler->input != NULL) {
				1773	ret = handler->input(&out->content[out->use], &written,
				1774	in->content, &toconv);
				1775	xmlBufferShrink(in, toconv);
				1776	out->use += written;
				1777	out->content[out->use] = 0;
				1778	}
				1779	#ifdef LIBXML_ICONV_ENABLED
				1780	else if (handler->iconv_in != NULL) {
				1781	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				1782	&written, in->content, &toconv);
				1783	xmlBufferShrink(in, toconv);
				1784	out->use += written;
				1785	out->content[out->use] = 0;
				1786	if (ret == -1) ret = -3;
				1787	}
				1788	#endif /* LIBXML_ICONV_ENABLED */
				1789	#ifdef DEBUG_ENCODING
				1790	switch (ret) {
				1791	case 0:
				1792	xmlGenericError(xmlGenericErrorContext,
				1793	"converted %d bytes to %d bytes of input\n",
				1794	toconv, written);
				1795	break;
				1796	case -1:
				1797	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				1798	toconv, written, in->use);
				1799	break;
				1800	case -2:
				1801	xmlGenericError(xmlGenericErrorContext,
				1802	"input conversion failed due to input error\n");
				1803	break;
				1804	case -3:
				1805	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				1806	toconv, written, in->use);
				1807	break;
				1808	default:
				1809	xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
				1810	}
				1811	#endif
				1812	/*
				1813	* Ignore when input buffer is not on a boundary
				1814	*/
				1815	if (ret == -3) ret = 0;
				1816	if (ret == -1) ret = 0;
				1817	return(ret);
				1818	}
				1819
				1820	/**
				1821	* xmlCharEncInFunc:
				1822	* @handler: char enconding transformation data structure
				1823	* @out: an xmlBuffer for the output.
				1824	* @in: an xmlBuffer for the input
				1825	*
				1826	* Generic front-end for the encoding handler input function
				1827	*
				1828	* Returns the number of byte written if success, or
				1829	* -1 general error
				1830	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1831	* the result of transformation can't fit into the encoding we want), or
				1832	*/
				1833	int
				1834	xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				1835	xmlBufferPtr in) {
				1836	int ret = -2;
				1837	int written;
				1838	int toconv;
				1839
				1840	if (handler == NULL) return(-1);
				1841	if (out == NULL) return(-1);
				1842	if (in == NULL) return(-1);
				1843
				1844	toconv = in->use;
				1845	if (toconv == 0)
				1846	return(0);
				1847	written = out->size - out->use;
				1848	if (toconv * 2 >= written) {
				1849	xmlBufferGrow(out, out->size + toconv * 2);
				1850	written = out->size - out->use - 1;
				1851	}
				1852	if (handler->input != NULL) {
				1853	ret = handler->input(&out->content[out->use], &written,
				1854	in->content, &toconv);
				1855	xmlBufferShrink(in, toconv);
				1856	out->use += written;
				1857	out->content[out->use] = 0;
				1858	}
				1859	#ifdef LIBXML_ICONV_ENABLED
				1860	else if (handler->iconv_in != NULL) {
				1861	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				1862	&written, in->content, &toconv);
				1863	xmlBufferShrink(in, toconv);
				1864	out->use += written;
				1865	out->content[out->use] = 0;
				1866	if (ret == -1) ret = -3;
				1867	}
				1868	#endif /* LIBXML_ICONV_ENABLED */
				1869	switch (ret) {
				1870	#ifdef DEBUG_ENCODING
				1871	case 0:
				1872	xmlGenericError(xmlGenericErrorContext,
				1873	"converted %d bytes to %d bytes of input\n",
				1874	toconv, written);
				1875	break;
				1876	case -1:
				1877	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				1878	toconv, written, in->use);
				1879	break;
				1880	case -3:
				1881	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				1882	toconv, written, in->use);
				1883	break;
				1884	#endif
				1885	case -2:
				1886	xmlGenericError(xmlGenericErrorContext,
				1887	"input conversion failed due to input error\n");
				1888	xmlGenericError(xmlGenericErrorContext,
				1889	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				1890	in->content[0], in->content[1],
				1891	in->content[2], in->content[3]);
				1892	}
				1893	/*
				1894	* Ignore when input buffer is not on a boundary
				1895	*/
				1896	if (ret == -3) ret = 0;
				1897	return(ret);
				1898	}
				1899
				1900	/**
				1901	* xmlCharEncOutFunc:
				1902	* @handler: char enconding transformation data structure
				1903	* @out: an xmlBuffer for the output.
				1904	* @in: an xmlBuffer for the input
				1905	*
				1906	* Generic front-end for the encoding handler output function
				1907	* a first call with @in == NULL has to be made firs to initiate the
				1908	* output in case of non-stateless encoding needing to initiate their
				1909	* state or the output (like the BOM in UTF16).
				1910	* In case of UTF8 sequence conversion errors for the given encoder,
				1911	* the content will be automatically remapped to a CharRef sequence.
				1912	*
				1913	* Returns the number of byte written if success, or
				1914	* -1 general error
				1915	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1916	* the result of transformation can't fit into the encoding we want), or
				1917	*/
				1918	int
				1919	xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				1920	xmlBufferPtr in) {
				1921	int ret = -2;
				1922	int written;
				1923	int writtentot = 0;
				1924	int toconv;
				1925	int output = 0;
				1926
				1927	if (handler == NULL) return(-1);
				1928	if (out == NULL) return(-1);
				1929
				1930	retry:
				1931
				1932	written = out->size - out->use;
				1933
				1934	/*
				1935	* First specific handling of in = NULL, i.e. the initialization call
				1936	*/
				1937	if (in == NULL) {
				1938	toconv = 0;
				1939	if (handler->output != NULL) {
				1940	ret = handler->output(&out->content[out->use], &written,
				1941	NULL, &toconv);
				1942	out->use += written;
				1943	out->content[out->use] = 0;
				1944	}
				1945	#ifdef LIBXML_ICONV_ENABLED
				1946	else if (handler->iconv_out != NULL) {
				1947	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				1948	&written, NULL, &toconv);
				1949	out->use += written;
				1950	out->content[out->use] = 0;
				1951	}
				1952	#endif /* LIBXML_ICONV_ENABLED */
				1953	#ifdef DEBUG_ENCODING
				1954	xmlGenericError(xmlGenericErrorContext,
				1955	"initialized encoder\n");
				1956	#endif
				1957	return(0);
				1958	}
				1959
				1960	/*
				1961	* Convertion itself.
				1962	*/
				1963	toconv = in->use;
				1964	if (toconv == 0)
				1965	return(0);
				1966	if (toconv * 2 >= written) {
				1967	xmlBufferGrow(out, toconv * 2);
				1968	written = out->size - out->use - 1;
				1969	}
				1970	if (handler->output != NULL) {
				1971	ret = handler->output(&out->content[out->use], &written,
				1972	in->content, &toconv);
				1973	xmlBufferShrink(in, toconv);
				1974	out->use += written;
				1975	writtentot += written;
				1976	out->content[out->use] = 0;
				1977	}
				1978	#ifdef LIBXML_ICONV_ENABLED
				1979	else if (handler->iconv_out != NULL) {
				1980	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				1981	&written, in->content, &toconv);
				1982	xmlBufferShrink(in, toconv);
				1983	out->use += written;
				1984	writtentot += written;
				1985	out->content[out->use] = 0;
				1986	if (ret == -1) {
				1987	if (written > 0) {
				1988	/*
				1989	* Can be a limitation of iconv
				1990	*/
				1991	goto retry;
				1992	}
				1993	ret = -3;
				1994	}
				1995	}
				1996	#endif /* LIBXML_ICONV_ENABLED */
				1997	else {
				1998	xmlGenericError(xmlGenericErrorContext,
				1999	"xmlCharEncOutFunc: no output function !\n");
				2000	return(-1);
				2001	}
				2002
				2003	if (ret >= 0) output += ret;
				2004
				2005	/*
				2006	* Attempt to handle error cases
				2007	*/
				2008	switch (ret) {
				2009	#ifdef DEBUG_ENCODING
				2010	case 0:
				2011	xmlGenericError(xmlGenericErrorContext,
				2012	"converted %d bytes to %d bytes of output\n",
				2013	toconv, written);
				2014	break;
				2015	case -1:
				2016	xmlGenericError(xmlGenericErrorContext,
				2017	"output conversion failed by lack of space\n");
				2018	break;
				2019	#endif
				2020	case -3:
				2021	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
				2022	toconv, written, in->use);
				2023	break;
				2024	case -2: {
				2025	int len = in->use;
				2026	const xmlChar utf = (const xmlChar ) in->content;
				2027	int cur;
				2028
				2029	cur = xmlGetUTF8Char(utf, &len);
				2030	if (cur > 0) {
				2031	xmlChar charref[20];
				2032
				2033	#ifdef DEBUG_ENCODING
				2034	xmlGenericError(xmlGenericErrorContext,
				2035	"handling output conversion error\n");
				2036	xmlGenericError(xmlGenericErrorContext,
				2037	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2038	in->content[0], in->content[1],
				2039	in->content[2], in->content[3]);
				2040	#endif
				2041	/*
				2042	* Removes the UTF8 sequence, and replace it by a charref
				2043	* and continue the transcoding phase, hoping the error
				2044	* did not mangle the encoder state.
				2045	*/
				2046	sprintf((char *) charref, "&#x%X;", cur);
				2047	xmlBufferShrink(in, len);
				2048	xmlBufferAddHead(in, charref, -1);
				2049
				2050	goto retry;
				2051	} else {
				2052	xmlGenericError(xmlGenericErrorContext,
				2053	"output conversion failed due to conv error\n");
				2054	xmlGenericError(xmlGenericErrorContext,
				2055	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2056	in->content[0], in->content[1],
				2057	in->content[2], in->content[3]);
				2058	in->content[0] = ' ';
				2059	}
				2060	break;
				2061	}
				2062	}
				2063	return(ret);
				2064	}
				2065
				2066	/**
				2067	* xmlCharEncCloseFunc:
				2068	* @handler: char enconding transformation data structure
				2069	*
				2070	* Generic front-end for hencoding handler close function
				2071	*
				2072	* Returns 0 if success, or -1 in case of error
				2073	*/
				2074	int
				2075	xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
				2076	int ret = 0;
				2077	if (handler == NULL) return(-1);
				2078	if (handler->name == NULL) return(-1);
				2079	#ifdef LIBXML_ICONV_ENABLED
				2080	/*
				2081	* Iconv handlers can be oused only once, free the whole block.
				2082	* and the associated icon resources.
				2083	*/
				2084	if ((handler->iconv_out != NULL) \|\| (handler->iconv_in != NULL)) {
				2085	if (handler->name != NULL)
				2086	xmlFree(handler->name);
				2087	handler->name = NULL;
				2088	if (handler->iconv_out != NULL) {
				2089	if (iconv_close(handler->iconv_out))
				2090	ret = -1;
				2091	handler->iconv_out = NULL;
				2092	}
				2093	if (handler->iconv_in != NULL) {
				2094	if (iconv_close(handler->iconv_in))
				2095	ret = -1;
				2096	handler->iconv_in = NULL;
				2097	}
				2098	xmlFree(handler);
				2099	}
				2100	#endif /* LIBXML_ICONV_ENABLED */
				2101	#ifdef DEBUG_ENCODING
				2102	if (ret)
				2103	xmlGenericError(xmlGenericErrorContext,
				2104	"failed to close the encoding handler\n");
				2105	else
				2106	xmlGenericError(xmlGenericErrorContext,
				2107	"closed the encoding handler\n");
				2108
				2109	#endif
				2110	return(ret);
				2111	}
				2112