Blame - encoding.c - fp2-dev/platform/external/libxml2

blob: db7b0cf036592719fabd0f9430aaa64524b20dc8 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* encoding.c : implements the encoding conversion functions needed for XML
				3	*
				4	* Related specs:
				5	* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
				6	* rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
				7	* [ISO-10646] UTF-8 and UTF-16 in Annexes
				8	* [ISO-8859-1] ISO Latin-1 characters codes.
				9	* [UNICODE] The Unicode Consortium, "The Unicode Standard --
				10	* Worldwide Character Encoding -- Version 1.0", Addison-
				11	* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
				12	* described in Unicode Technical Report #4.
				13	* [US-ASCII] Coded Character Set--7-bit American Standard Code for
				14	* Information Interchange, ANSI X3.4-1986.
				15	*
				16	* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
				17	*
				18	* See Copyright for the status of this software.
				19	*
				20	* Daniel.Veillard@w3.org
				21	*/
				22
				23	#ifdef WIN32
				24	#include "win32config.h"
				25	#else
				26	#include "config.h"
				27	#endif
				28
				29	#include <stdio.h>
				30	#include <string.h>
				31
				32	#ifdef HAVE_CTYPE_H
				33	#include <ctype.h>
				34	#endif
				35	#ifdef HAVE_STDLIB_H
				36	#include <stdlib.h>
				37	#endif
				38	#include <libxml/xmlversion.h>
				39	#ifdef LIBXML_ICONV_ENABLED
				40	#ifdef HAVE_ERRNO_H
				41	#include <errno.h>
				42	#endif
				43	#endif
				44	#include <libxml/encoding.h>
				45	#include <libxml/xmlmemory.h>
				46	#ifdef LIBXML_HTML_ENABLED
				47	#include <libxml/HTMLparser.h>
				48	#endif
				49	#include <libxml/xmlerror.h>
				50
				51	xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
				52	xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
				53
				54	typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
				55	typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
				56	struct _xmlCharEncodingAlias {
				57	const char *name;
				58	const char *alias;
				59	};
				60
				61	static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
				62	static int xmlCharEncodingAliasesNb = 0;
				63	static int xmlCharEncodingAliasesMax = 0;
				64
				65	#ifdef LIBXML_ICONV_ENABLED
				66	#if 0
				67	#define DEBUG_ENCODING /* Define this to get encoding traces */
				68	#endif
				69	#endif
				70
				71	static int xmlLittleEndian = 1;
				72
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)      UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F     0xxxxxxx
 * 0000 0080-0000 07FF     110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF     1110xxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 */
				83
				84	/**
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame^]	85	* xmlUTF8Strlen:
				86	* @utf: a sequence of UTF-8 encoded bytes
				87	*
				88	* compute the lenght of an UTF8 string, it doesn't do a full UTF8
				89	* checking of the content of the string.
				90	*
				91	* Returns the number of characters in the string or -1 in case of error
				92	*/
				93	int
				94	xmlUTF8Strlen(const unsigned char *utf) {
				95	int ret = 0;
				96
				97	if (utf == NULL)
				98	return(-1);
				99
				100	while (*utf != 0) {
				101	if (utf[0] & 0x80) {
				102	if ((utf[1] & 0xc0) != 0x80)
				103	return(-1);
				104	if ((utf[0] & 0xe0) == 0xe0) {
				105	if ((utf[2] & 0xc0) != 0x80)
				106	return(-1);
				107	if ((utf[0] & 0xf0) == 0xf0) {
				108	if ((utf[0] & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				109	return(-1);
				110	utf += 4;
				111	} else {
				112	utf += 3;
				113	}
				114	} else {
				115	utf += 2;
				116	}
				117	} else {
				118	utf++;
				119	}
				120	ret++;
				121	}
				122	return(ret);
				123	}
				124
				125	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	126	* xmlGetUTF8Char:
				127	* @utf: a sequence of UTF-8 encoded bytes
				128	* @len: a pointer to @bytes len
				129	*
				130	* Read one UTF8 Char from @utf
				131	*
				132	* Returns the char value or -1 in case of error and update @len with the
				133	* number of bytes used
				134	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	135	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	136	xmlGetUTF8Char(const unsigned char utf, int len) {
				137	unsigned int c;
				138
				139	if (utf == NULL)
				140	goto error;
				141	if (len == NULL)
				142	goto error;
				143	if (*len < 1)
				144	goto error;
				145
				146	c = utf[0];
				147	if (c & 0x80) {
				148	if (*len < 2)
				149	goto error;
				150	if ((utf[1] & 0xc0) != 0x80)
				151	goto error;
				152	if ((c & 0xe0) == 0xe0) {
				153	if (*len < 3)
				154	goto error;
				155	if ((utf[2] & 0xc0) != 0x80)
				156	goto error;
				157	if ((c & 0xf0) == 0xf0) {
				158	if (*len < 4)
				159	goto error;
				160	if ((c & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				161	goto error;
				162	*len = 4;
				163	/* 4-byte code */
				164	c = (utf[0] & 0x7) << 18;
				165	c \|= (utf[1] & 0x3f) << 12;
				166	c \|= (utf[2] & 0x3f) << 6;
				167	c \|= utf[3] & 0x3f;
				168	} else {
				169	/* 3-byte code */
				170	*len = 3;
				171	c = (utf[0] & 0xf) << 12;
				172	c \|= (utf[1] & 0x3f) << 6;
				173	c \|= utf[2] & 0x3f;
				174	}
				175	} else {
				176	/* 2-byte code */
				177	*len = 2;
				178	c = (utf[0] & 0x1f) << 6;
				179	c \|= utf[1] & 0x3f;
				180	}
				181	} else {
				182	/* 1-byte code */
				183	*len = 1;
				184	}
				185	return(c);
				186
				187	error:
				188	*len = 0;
				189	return(-1);
				190	}
				191
				192	/**
				193	* xmlCheckUTF8: Check utf-8 string for legality.
				194	* @utf: Pointer to putative utf-8 encoded string.
				195	*
				196	* Checks @utf for being valid utf-8. @utf is assumed to be
				197	* null-terminated. This function is not super-strict, as it will
				198	* allow longer utf-8 sequences than necessary. Note that Java is
				199	* capable of producing these sequences if provoked. Also note, this
				200	* routine checks for the 4-byte maxiumum size, but does not check for
				201	* 0x10ffff maximum value.
				202	*
				203	* Return value: true if @utf is valid.
				204	**/
				205	int
				206	xmlCheckUTF8(const unsigned char *utf)
				207	{
				208	int ix;
				209	unsigned char c;
				210
				211	for (ix = 0; (c = utf[ix]);) {
				212	if (c & 0x80) {
				213	if ((utf[ix + 1] & 0xc0) != 0x80)
				214	return(0);
				215	if ((c & 0xe0) == 0xe0) {
				216	if ((utf[ix + 2] & 0xc0) != 0x80)
				217	return(0);
				218	if ((c & 0xf0) == 0xf0) {
				219	if ((c & 0xf8) != 0xf0 \|\| (utf[ix + 3] & 0xc0) != 0x80)
				220	return(0);
				221	ix += 4;
				222	/* 4-byte code */
				223	} else
				224	/* 3-byte code */
				225	ix += 3;
				226	} else
				227	/* 2-byte code */
				228	ix += 2;
				229	} else
				230	/* 1-byte code */
				231	ix++;
				232	}
				233	return(1);
				234	}
				235
				236	/**
				237	* asciiToUTF8:
				238	* @out: a pointer to an array of bytes to store the result
				239	* @outlen: the length of @out
				240	* @in: a pointer to an array of ASCII chars
				241	* @inlen: the length of @in
				242	*
				243	* Take a block of ASCII chars in and try to convert it to an UTF-8
				244	* block of chars out.
				245	* Returns 0 if success, or -1 otherwise
				246	* The value of @inlen after return is the number of octets consumed
				247	* as the return value is positive, else unpredictiable.
				248	* The value of @outlen after return is the number of ocetes consumed.
				249	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	250	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	251	asciiToUTF8(unsigned char* out, int *outlen,
				252	const unsigned char* in, int *inlen) {
				253	unsigned char* outstart = out;
				254	const unsigned char* base = in;
				255	const unsigned char* processed = in;
				256	unsigned char* outend = out + *outlen;
				257	const unsigned char* inend;
				258	unsigned int c;
				259	int bits;
				260
				261	inend = in + (*inlen);
				262	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				263	c= *in++;
				264
				265	/* assertion: c is a single UTF-4 value */
				266	if (out >= outend)
				267	break;
				268	if (c < 0x80) { *out++= c; bits= -6; }
				269	else {
				270	*outlen = out - outstart;
				271	*inlen = processed - base;
				272	return(-1);
				273	}
				274
				275	for ( ; bits >= 0; bits-= 6) {
				276	if (out >= outend)
				277	break;
				278	*out++= ((c >> bits) & 0x3F) \| 0x80;
				279	}
				280	processed = (const unsigned char*) in;
				281	}
				282	*outlen = out - outstart;
				283	*inlen = processed - base;
				284	return(0);
				285	}
				286
				287	/**
				288	* UTF8Toascii:
				289	* @out: a pointer to an array of bytes to store the result
				290	* @outlen: the length of @out
				291	* @in: a pointer to an array of UTF-8 chars
				292	* @inlen: the length of @in
				293	*
				294	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				295	* block of chars out.
				296	*
				297	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				298	* The value of @inlen after return is the number of octets consumed
				299	* as the return value is positive, else unpredictiable.
				300	* The value of @outlen after return is the number of ocetes consumed.
				301	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	302	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	303	UTF8Toascii(unsigned char* out, int *outlen,
				304	const unsigned char* in, int *inlen) {
				305	const unsigned char* processed = in;
				306	const unsigned char* outend;
				307	const unsigned char* outstart = out;
				308	const unsigned char* instart = in;
				309	const unsigned char* inend;
				310	unsigned int c, d;
				311	int trailing;
				312
				313	if (in == NULL) {
				314	/*
				315	* initialization nothing to do
				316	*/
				317	*outlen = 0;
				318	*inlen = 0;
				319	return(0);
				320	}
				321	inend = in + (*inlen);
				322	outend = out + (*outlen);
				323	while (in < inend) {
				324	d = *in++;
				325	if (d < 0x80) { c= d; trailing= 0; }
				326	else if (d < 0xC0) {
				327	/* trailing byte in leading position */
				328	*outlen = out - outstart;
				329	*inlen = processed - instart;
				330	return(-2);
				331	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				332	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				333	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				334	else {
				335	/* no chance for this in Ascii */
				336	*outlen = out - outstart;
				337	*inlen = processed - instart;
				338	return(-2);
				339	}
				340
				341	if (inend - in < trailing) {
				342	break;
				343	}
				344
				345	for ( ; trailing; trailing--) {
				346	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				347	break;
				348	c <<= 6;
				349	c \|= d & 0x3F;
				350	}
				351
				352	/* assertion: c is a single UTF-4 value */
				353	if (c < 0x80) {
				354	if (out >= outend)
				355	break;
				356	*out++ = c;
				357	} else {
				358	/* no chance for this in Ascii */
				359	*outlen = out - outstart;
				360	*inlen = processed - instart;
				361	return(-2);
				362	}
				363	processed = in;
				364	}
				365	*outlen = out - outstart;
				366	*inlen = processed - instart;
				367	return(0);
				368	}
				369
				370	/**
				371	* isolat1ToUTF8:
				372	* @out: a pointer to an array of bytes to store the result
				373	* @outlen: the length of @out
				374	* @in: a pointer to an array of ISO Latin 1 chars
				375	* @inlen: the length of @in
				376	*
				377	* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
				378	* block of chars out.
				379	* Returns 0 if success, or -1 otherwise
				380	* The value of @inlen after return is the number of octets consumed
				381	* as the return value is positive, else unpredictiable.
				382	* The value of @outlen after return is the number of ocetes consumed.
				383	*/
				384	int
				385	isolat1ToUTF8(unsigned char* out, int *outlen,
				386	const unsigned char* in, int *inlen) {
				387	unsigned char* outstart = out;
				388	const unsigned char* base = in;
				389	const unsigned char* processed = in;
				390	unsigned char* outend = out + *outlen;
				391	const unsigned char* inend;
				392	unsigned int c;
				393	int bits;
				394
				395	inend = in + (*inlen);
				396	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				397	c= *in++;
				398
				399	/* assertion: c is a single UTF-4 value */
				400	if (out >= outend)
				401	break;
				402	if (c < 0x80) { *out++= c; bits= -6; }
				403	else { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				404
				405	for ( ; bits >= 0; bits-= 6) {
				406	if (out >= outend)
				407	break;
				408	*out++= ((c >> bits) & 0x3F) \| 0x80;
				409	}
				410	processed = (const unsigned char*) in;
				411	}
				412	*outlen = out - outstart;
				413	*inlen = processed - base;
				414	return(0);
				415	}
				416
				417	/**
				418	* UTF8Toisolat1:
				419	* @out: a pointer to an array of bytes to store the result
				420	* @outlen: the length of @out
				421	* @in: a pointer to an array of UTF-8 chars
				422	* @inlen: the length of @in
				423	*
				424	* Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
				425	* block of chars out.
				426	*
				427	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				428	* The value of @inlen after return is the number of octets consumed
				429	* as the return value is positive, else unpredictiable.
				430	* The value of @outlen after return is the number of ocetes consumed.
				431	*/
				432	int
				433	UTF8Toisolat1(unsigned char* out, int *outlen,
				434	const unsigned char* in, int *inlen) {
				435	const unsigned char* processed = in;
				436	const unsigned char* outend;
				437	const unsigned char* outstart = out;
				438	const unsigned char* instart = in;
				439	const unsigned char* inend;
				440	unsigned int c, d;
				441	int trailing;
				442
				443	if (in == NULL) {
				444	/*
				445	* initialization nothing to do
				446	*/
				447	*outlen = 0;
				448	*inlen = 0;
				449	return(0);
				450	}
				451	inend = in + (*inlen);
				452	outend = out + (*outlen);
				453	while (in < inend) {
				454	d = *in++;
				455	if (d < 0x80) { c= d; trailing= 0; }
				456	else if (d < 0xC0) {
				457	/* trailing byte in leading position */
				458	*outlen = out - outstart;
				459	*inlen = processed - instart;
				460	return(-2);
				461	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				462	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				463	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				464	else {
				465	/* no chance for this in IsoLat1 */
				466	*outlen = out - outstart;
				467	*inlen = processed - instart;
				468	return(-2);
				469	}
				470
				471	if (inend - in < trailing) {
				472	break;
				473	}
				474
				475	for ( ; trailing; trailing--) {
				476	if (in >= inend)
				477	break;
				478	if (((d= *in++) & 0xC0) != 0x80) {
				479	*outlen = out - outstart;
				480	*inlen = processed - instart;
				481	return(-2);
				482	}
				483	c <<= 6;
				484	c \|= d & 0x3F;
				485	}
				486
				487	/* assertion: c is a single UTF-4 value */
				488	if (c <= 0xFF) {
				489	if (out >= outend)
				490	break;
				491	*out++ = c;
				492	} else {
				493	/* no chance for this in IsoLat1 */
				494	*outlen = out - outstart;
				495	*inlen = processed - instart;
				496	return(-2);
				497	}
				498	processed = in;
				499	}
				500	*outlen = out - outstart;
				501	*inlen = processed - instart;
				502	return(0);
				503	}
				504
				505	/**
				506	* UTF16LEToUTF8:
				507	* @out: a pointer to an array of bytes to store the result
				508	* @outlen: the length of @out
				509	* @inb: a pointer to an array of UTF-16LE passwd as a byte array
				510	* @inlenb: the length of @in in UTF-16LE chars
				511	*
				512	* Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
				513	* block of chars out. This function assume the endian properity
				514	* is the same between the native type of this machine and the
				515	* inputed one.
				516	*
				517	* Returns the number of byte written, or -1 by lack of space, or -2
				518	* if the transcoding fails (for *in is not valid utf16 string)
				519	* The value of *inlen after return is the number of octets consumed
				520	* as the return value is positive, else unpredictiable.
				521	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	522	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	523	UTF16LEToUTF8(unsigned char* out, int *outlen,
				524	const unsigned char* inb, int *inlenb)
				525	{
				526	unsigned char* outstart = out;
				527	const unsigned char* processed = inb;
				528	unsigned char* outend = out + *outlen;
				529	unsigned short* in = (unsigned short*) inb;
				530	unsigned short* inend;
				531	unsigned int c, d, inlen;
				532	unsigned char *tmp;
				533	int bits;
				534
				535	if ((*inlenb % 2) == 1)
				536	(*inlenb)--;
				537	inlen = *inlenb / 2;
				538	inend = in + inlen;
				539	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				540	if (xmlLittleEndian) {
				541	c= *in++;
				542	} else {
				543	tmp = (unsigned char *) in;
				544	c = *tmp++;
				545	c = c \| (((unsigned int)*tmp) << 8);
				546	in++;
				547	}
				548	if ((c & 0xFC00) == 0xD800) { /* surrogates */
				549	if (in >= inend) { /* (in > inend) shouldn't happens */
				550	break;
				551	}
				552	if (xmlLittleEndian) {
				553	d = *in++;
				554	} else {
				555	tmp = (unsigned char *) in;
				556	d = *tmp++;
				557	d = d \| (((unsigned int)*tmp) << 8);
				558	in++;
				559	}
				560	if ((d & 0xFC00) == 0xDC00) {
				561	c &= 0x03FF;
				562	c <<= 10;
				563	c \|= d & 0x03FF;
				564	c += 0x10000;
				565	}
				566	else {
				567	*outlen = out - outstart;
				568	*inlenb = processed - inb;
				569	return(-2);
				570	}
				571	}
				572
				573	/* assertion: c is a single UTF-4 value */
				574	if (out >= outend)
				575	break;
				576	if (c < 0x80) { *out++= c; bits= -6; }
				577	else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				578	else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				579	else { *out++= ((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				580
				581	for ( ; bits >= 0; bits-= 6) {
				582	if (out >= outend)
				583	break;
				584	*out++= ((c >> bits) & 0x3F) \| 0x80;
				585	}
				586	processed = (const unsigned char*) in;
				587	}
				588	*outlen = out - outstart;
				589	*inlenb = processed - inb;
				590	return(0);
				591	}
				592
				593	/**
				594	* UTF8ToUTF16LE:
				595	* @outb: a pointer to an array of bytes to store the result
				596	* @outlen: the length of @outb
				597	* @in: a pointer to an array of UTF-8 chars
				598	* @inlen: the length of @in
				599	*
				600	* Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
				601	* block of chars out.
				602	*
				603	* Returns the number of byte written, or -1 by lack of space, or -2
				604	* if the transcoding failed.
				605	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	606	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	607	UTF8ToUTF16LE(unsigned char* outb, int *outlen,
				608	const unsigned char* in, int *inlen)
				609	{
				610	unsigned short* out = (unsigned short*) outb;
				611	const unsigned char* processed = in;
				612	unsigned short* outstart= out;
				613	unsigned short* outend;
				614	const unsigned char* inend= in+*inlen;
				615	unsigned int c, d;
				616	int trailing;
				617	unsigned char *tmp;
				618	unsigned short tmp1, tmp2;
				619
				620	if (in == NULL) {
				621	/*
				622	* initialization, add the Byte Order Mark
				623	*/
				624	if (*outlen >= 2) {
				625	outb[0] = 0xFF;
				626	outb[1] = 0xFE;
				627	*outlen = 2;
				628	*inlen = 0;
				629	#ifdef DEBUG_ENCODING
				630	xmlGenericError(xmlGenericErrorContext,
				631	"Added FFFE Byte Order Mark\n");
				632	#endif
				633	return(2);
				634	}
				635	*outlen = 0;
				636	*inlen = 0;
				637	return(0);
				638	}
				639	outend = out + (*outlen / 2);
				640	while (in < inend) {
				641	d= *in++;
				642	if (d < 0x80) { c= d; trailing= 0; }
				643	else if (d < 0xC0) {
				644	/* trailing byte in leading position */
				645	outlen = (out - outstart) 2;
				646	*inlen = processed - in;
				647	return(-2);
				648	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				649	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				650	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				651	else {
				652	/* no chance for this in UTF-16 */
				653	outlen = (out - outstart) 2;
				654	*inlen = processed - in;
				655	return(-2);
				656	}
				657
				658	if (inend - in < trailing) {
				659	break;
				660	}
				661
				662	for ( ; trailing; trailing--) {
				663	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				664	break;
				665	c <<= 6;
				666	c \|= d & 0x3F;
				667	}
				668
				669	/* assertion: c is a single UTF-4 value */
				670	if (c < 0x10000) {
				671	if (out >= outend)
				672	break;
				673	if (xmlLittleEndian) {
				674	*out++ = c;
				675	} else {
				676	tmp = (unsigned char *) out;
				677	*tmp = c ;
				678	*(tmp + 1) = c >> 8 ;
				679	out++;
				680	}
				681	}
				682	else if (c < 0x110000) {
				683	if (out+1 >= outend)
				684	break;
				685	c -= 0x10000;
				686	if (xmlLittleEndian) {
				687	*out++ = 0xD800 \| (c >> 10);
				688	*out++ = 0xDC00 \| (c & 0x03FF);
				689	} else {
				690	tmp1 = 0xD800 \| (c >> 10);
				691	tmp = (unsigned char *) out;
				692	*tmp = (unsigned char) tmp1;
				693	*(tmp + 1) = tmp1 >> 8;
				694	out++;
				695
				696	tmp2 = 0xDC00 \| (c & 0x03FF);
				697	tmp = (unsigned char *) out;
				698	*tmp = (unsigned char) tmp2;
				699	*(tmp + 1) = tmp2 >> 8;
				700	out++;
				701	}
				702	}
				703	else
				704	break;
				705	processed = in;
				706	}
				707	outlen = (out - outstart) 2;
				708	*inlen = processed - in;
				709	return(0);
				710	}
				711
				712	/**
				713	* UTF16BEToUTF8:
				714	* @out: a pointer to an array of bytes to store the result
				715	* @outlen: the length of @out
				716	* @inb: a pointer to an array of UTF-16 passwd as a byte array
				717	* @inlenb: the length of @in in UTF-16 chars
				718	*
				719	* Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
				720	* block of chars out. This function assume the endian properity
				721	* is the same between the native type of this machine and the
				722	* inputed one.
				723	*
				724	* Returns the number of byte written, or -1 by lack of space, or -2
				725	* if the transcoding fails (for *in is not valid utf16 string)
				726	* The value of *inlen after return is the number of octets consumed
				727	* as the return value is positive, else unpredictiable.
				728	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	729	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	730	UTF16BEToUTF8(unsigned char* out, int *outlen,
				731	const unsigned char* inb, int *inlenb)
				732	{
				733	unsigned char* outstart = out;
				734	const unsigned char* processed = inb;
				735	unsigned char* outend = out + *outlen;
				736	unsigned short* in = (unsigned short*) inb;
				737	unsigned short* inend;
				738	unsigned int c, d, inlen;
				739	unsigned char *tmp;
				740	int bits;
				741
				742	if ((*inlenb % 2) == 1)
				743	(*inlenb)--;
				744	inlen = *inlenb / 2;
				745	inend= in + inlen;
				746	while (in < inend) {
				747	if (xmlLittleEndian) {
				748	tmp = (unsigned char *) in;
				749	c = *tmp++;
				750	c = c << 8;
				751	c = c \| (unsigned int) *tmp;
				752	in++;
				753	} else {
				754	c= *in++;
				755	}
				756	if ((c & 0xFC00) == 0xD800) { /* surrogates */
				757	if (in >= inend) { /* (in > inend) shouldn't happens */
				758	*outlen = out - outstart;
				759	*inlenb = processed - inb;
				760	return(-2);
				761	}
				762	if (xmlLittleEndian) {
				763	tmp = (unsigned char *) in;
				764	d = *tmp++;
				765	d = d << 8;
				766	d = d \| (unsigned int) *tmp;
				767	in++;
				768	} else {
				769	d= *in++;
				770	}
				771	if ((d & 0xFC00) == 0xDC00) {
				772	c &= 0x03FF;
				773	c <<= 10;
				774	c \|= d & 0x03FF;
				775	c += 0x10000;
				776	}
				777	else {
				778	*outlen = out - outstart;
				779	*inlenb = processed - inb;
				780	return(-2);
				781	}
				782	}
				783
				784	/* assertion: c is a single UTF-4 value */
				785	if (out >= outend)
				786	break;
				787	if (c < 0x80) { *out++= c; bits= -6; }
				788	else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				789	else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				790	else { *out++= ((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				791
				792	for ( ; bits >= 0; bits-= 6) {
				793	if (out >= outend)
				794	break;
				795	*out++= ((c >> bits) & 0x3F) \| 0x80;
				796	}
				797	processed = (const unsigned char*) in;
				798	}
				799	*outlen = out - outstart;
				800	*inlenb = processed - inb;
				801	return(0);
				802	}
				803
				804	/**
				805	* UTF8ToUTF16BE:
				806	* @outb: a pointer to an array of bytes to store the result
				807	* @outlen: the length of @outb
				808	* @in: a pointer to an array of UTF-8 chars
				809	* @inlen: the length of @in
				810	*
				811	* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
				812	* block of chars out.
				813	*
				814	* Returns the number of byte written, or -1 by lack of space, or -2
				815	* if the transcoding failed.
				816	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	817	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	818	UTF8ToUTF16BE(unsigned char* outb, int *outlen,
				819	const unsigned char* in, int *inlen)
				820	{
				821	unsigned short* out = (unsigned short*) outb;
				822	const unsigned char* processed = in;
				823	unsigned short* outstart= out;
				824	unsigned short* outend;
				825	const unsigned char* inend= in+*inlen;
				826	unsigned int c, d;
				827	int trailing;
				828	unsigned char *tmp;
				829	unsigned short tmp1, tmp2;
				830
				831	if (in == NULL) {
				832	/*
				833	* initialization, add the Byte Order Mark
				834	*/
				835	if (*outlen >= 2) {
				836	outb[0] = 0xFE;
				837	outb[1] = 0xFF;
				838	*outlen = 2;
				839	*inlen = 0;
				840	#ifdef DEBUG_ENCODING
				841	xmlGenericError(xmlGenericErrorContext,
				842	"Added FEFF Byte Order Mark\n");
				843	#endif
				844	return(2);
				845	}
				846	*outlen = 0;
				847	*inlen = 0;
				848	return(0);
				849	}
				850	outend = out + (*outlen / 2);
				851	while (in < inend) {
				852	d= *in++;
				853	if (d < 0x80) { c= d; trailing= 0; }
				854	else if (d < 0xC0) {
				855	/* trailing byte in leading position */
				856	*outlen = out - outstart;
				857	*inlen = processed - in;
				858	return(-2);
				859	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				860	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				861	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				862	else {
				863	/* no chance for this in UTF-16 */
				864	*outlen = out - outstart;
				865	*inlen = processed - in;
				866	return(-2);
				867	}
				868
				869	if (inend - in < trailing) {
				870	break;
				871	}
				872
				873	for ( ; trailing; trailing--) {
				874	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80)) break;
				875	c <<= 6;
				876	c \|= d & 0x3F;
				877	}
				878
				879	/* assertion: c is a single UTF-4 value */
				880	if (c < 0x10000) {
				881	if (out >= outend) break;
				882	if (xmlLittleEndian) {
				883	tmp = (unsigned char *) out;
				884	*tmp = c >> 8;
				885	*(tmp + 1) = c;
				886	out++;
				887	} else {
				888	*out++ = c;
				889	}
				890	}
				891	else if (c < 0x110000) {
				892	if (out+1 >= outend) break;
				893	c -= 0x10000;
				894	if (xmlLittleEndian) {
				895	tmp1 = 0xD800 \| (c >> 10);
				896	tmp = (unsigned char *) out;
				897	*tmp = tmp1 >> 8;
				898	*(tmp + 1) = (unsigned char) tmp1;
				899	out++;
				900
				901	tmp2 = 0xDC00 \| (c & 0x03FF);
				902	tmp = (unsigned char *) out;
				903	*tmp = tmp2 >> 8;
				904	*(tmp + 1) = (unsigned char) tmp2;
				905	out++;
				906	} else {
				907	*out++ = 0xD800 \| (c >> 10);
				908	*out++ = 0xDC00 \| (c & 0x03FF);
				909	}
				910	}
				911	else
				912	break;
				913	processed = in;
				914	}
				915	outlen = (out - outstart) 2;
				916	*inlen = processed - in;
				917	return(0);
				918	}
				919
				920	/**
				921	* xmlDetectCharEncoding:
				922	* @in: a pointer to the first bytes of the XML entity, must be at least
				923	* 4 bytes long.
				924	* @len: pointer to the length of the buffer
				925	*
				926	* Guess the encoding of the entity using the first bytes of the entity content
				927	* accordingly of the non-normative appendix F of the XML-1.0 recommendation.
				928	*
				929	* Returns one of the XML_CHAR_ENCODING_... values.
				930	*/
				931	xmlCharEncoding
				932	xmlDetectCharEncoding(const unsigned char* in, int len)
				933	{
				934	if (len >= 4) {
				935	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				936	(in[2] == 0x00) && (in[3] == 0x3C))
				937	return(XML_CHAR_ENCODING_UCS4BE);
				938	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
				939	(in[2] == 0x00) && (in[3] == 0x00))
				940	return(XML_CHAR_ENCODING_UCS4LE);
				941	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				942	(in[2] == 0x3C) && (in[3] == 0x00))
				943	return(XML_CHAR_ENCODING_UCS4_2143);
				944	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
				945	(in[2] == 0x00) && (in[3] == 0x00))
				946	return(XML_CHAR_ENCODING_UCS4_3412);
				947	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
				948	(in[2] == 0xA7) && (in[3] == 0x94))
				949	return(XML_CHAR_ENCODING_EBCDIC);
				950	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
				951	(in[2] == 0x78) && (in[3] == 0x6D))
				952	return(XML_CHAR_ENCODING_UTF8);
				953	}
				954	if (len >= 2) {
				955	if ((in[0] == 0xFE) && (in[1] == 0xFF))
				956	return(XML_CHAR_ENCODING_UTF16BE);
				957	if ((in[0] == 0xFF) && (in[1] == 0xFE))
				958	return(XML_CHAR_ENCODING_UTF16LE);
				959	}
				960	return(XML_CHAR_ENCODING_NONE);
				961	}
				962
				963	/**
				964	* xmlCleanupEncodingAliases:
				965	*
				966	* Unregisters all aliases
				967	*/
				968	void
				969	xmlCleanupEncodingAliases(void) {
				970	int i;
				971
				972	if (xmlCharEncodingAliases == NULL)
				973	return;
				974
				975	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				976	if (xmlCharEncodingAliases[i].name != NULL)
				977	xmlFree((char *) xmlCharEncodingAliases[i].name);
				978	if (xmlCharEncodingAliases[i].alias != NULL)
				979	xmlFree((char *) xmlCharEncodingAliases[i].alias);
				980	}
				981	xmlCharEncodingAliasesNb = 0;
				982	xmlCharEncodingAliasesMax = 0;
				983	xmlFree(xmlCharEncodingAliases);
				984	}
				985
				986	/**
				987	* xmlGetEncodingAlias:
				988	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				989	*
				990	* Lookup an encoding name for the given alias.
				991	*
				992	* Returns NULL if not found the original name otherwise
				993	*/
				994	const char *
				995	xmlGetEncodingAlias(const char *alias) {
				996	int i;
				997	char upper[100];
				998
				999	if (alias == NULL)
				1000	return(NULL);
				1001
				1002	if (xmlCharEncodingAliases == NULL)
				1003	return(NULL);
				1004
				1005	for (i = 0;i < 99;i++) {
				1006	upper[i] = toupper(alias[i]);
				1007	if (upper[i] == 0) break;
				1008	}
				1009	upper[i] = 0;
				1010
				1011	/*
				1012	* Walk down the list looking for a definition of the alias
				1013	*/
				1014	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1015	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
				1016	return(xmlCharEncodingAliases[i].name);
				1017	}
				1018	}
				1019	return(NULL);
				1020	}
				1021
				1022	/**
				1023	* xmlAddEncodingAlias:
				1024	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
				1025	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1026	*
				1027	* Registers and alias @alias for an encoding named @name. Existing alias
				1028	* will be overwritten.
				1029	*
				1030	* Returns 0 in case of success, -1 in case of error
				1031	*/
				1032	int
				1033	xmlAddEncodingAlias(const char name, const char alias) {
				1034	int i;
				1035	char upper[100];
				1036
				1037	if ((name == NULL) \|\| (alias == NULL))
				1038	return(-1);
				1039
				1040	for (i = 0;i < 99;i++) {
				1041	upper[i] = toupper(alias[i]);
				1042	if (upper[i] == 0) break;
				1043	}
				1044	upper[i] = 0;
				1045
				1046	if (xmlCharEncodingAliases == NULL) {
				1047	xmlCharEncodingAliasesNb = 0;
				1048	xmlCharEncodingAliasesMax = 20;
				1049	xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
				1050	xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
				1051	if (xmlCharEncodingAliases == NULL)
				1052	return(-1);
				1053	} else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
				1054	xmlCharEncodingAliasesMax *= 2;
				1055	xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
				1056	xmlRealloc(xmlCharEncodingAliases,
				1057	xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
				1058	}
				1059	/*
				1060	* Walk down the list looking for a definition of the alias
				1061	*/
				1062	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1063	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
				1064	/*
				1065	* Replace the definition.
				1066	*/
				1067	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1068	xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
				1069	return(0);
				1070	}
				1071	}
				1072	/*
				1073	* Add the definition
				1074	*/
				1075	xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
				1076	xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
				1077	xmlCharEncodingAliasesNb++;
				1078	return(0);
				1079	}
				1080
				1081	/**
				1082	* xmlDelEncodingAlias:
				1083	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1084	*
				1085	* Unregisters an encoding alias @alias
				1086	*
				1087	* Returns 0 in case of success, -1 in case of error
				1088	*/
				1089	int
				1090	xmlDelEncodingAlias(const char *alias) {
				1091	int i;
				1092
				1093	if (alias == NULL)
				1094	return(-1);
				1095
				1096	if (xmlCharEncodingAliases == NULL)
				1097	return(-1);
				1098	/*
				1099	* Walk down the list looking for a definition of the alias
				1100	*/
				1101	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1102	if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
				1103	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1104	xmlFree((char *) xmlCharEncodingAliases[i].alias);
				1105	xmlCharEncodingAliasesNb--;
				1106	memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
				1107	sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
				1108	return(0);
				1109	}
				1110	}
				1111	return(-1);
				1112	}
				1113
				1114	/**
				1115	* xmlParseCharEncoding:
				1116	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
				1117	*
				1118	* Conpare the string to the known encoding schemes already known. Note
				1119	* that the comparison is case insensitive accordingly to the section
				1120	* [XML] 4.3.3 Character Encoding in Entities.
				1121	*
				1122	* Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
				1123	* if not recognized.
				1124	*/
				1125	xmlCharEncoding
				1126	xmlParseCharEncoding(const char* name)
				1127	{
				1128	const char *alias;
				1129	char upper[500];
				1130	int i;
				1131
				1132	if (name == NULL)
				1133	return(XML_CHAR_ENCODING_NONE);
				1134
				1135	/*
				1136	* Do the alias resolution
				1137	*/
				1138	alias = xmlGetEncodingAlias(name);
				1139	if (alias != NULL)
				1140	name = alias;
				1141
				1142	for (i = 0;i < 499;i++) {
				1143	upper[i] = toupper(name[i]);
				1144	if (upper[i] == 0) break;
				1145	}
				1146	upper[i] = 0;
				1147
				1148	if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
				1149	if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
				1150	if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
				1151
				1152	/*
				1153	* NOTE: if we were able to parse this, the endianness of UTF16 is
				1154	* already found and in use
				1155	*/
				1156	if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
				1157	if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
				1158
				1159	if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				1160	if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				1161	if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
				1162
				1163	/*
				1164	* NOTE: if we were able to parse this, the endianness of UCS4 is
				1165	* already found and in use
				1166	*/
				1167	if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				1168	if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				1169	if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
				1170
				1171
				1172	if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
				1173	if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
				1174	if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
				1175
				1176	if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
				1177	if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
				1178	if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
				1179
				1180	if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
				1181	if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
				1182	if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
				1183	if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
				1184	if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
				1185	if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
				1186	if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
				1187
				1188	if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
				1189	if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
				1190	if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
				1191
				1192	#ifdef DEBUG_ENCODING
				1193	xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
				1194	#endif
				1195	return(XML_CHAR_ENCODING_ERROR);
				1196	}
				1197
				1198	/**
				1199	* xmlGetCharEncodingName:
				1200	* @enc: the encoding
				1201	*
				1202	* The "canonical" name for XML encoding.
				1203	* C.f. http://www.w3.org/TR/REC-xml#charencoding
				1204	* Section 4.3.3 Character Encoding in Entities
				1205	*
				1206	* Returns the canonical name for the given encoding
				1207	*/
				1208
				1209	const char*
				1210	xmlGetCharEncodingName(xmlCharEncoding enc) {
				1211	switch (enc) {
				1212	case XML_CHAR_ENCODING_ERROR:
				1213	return(NULL);
				1214	case XML_CHAR_ENCODING_NONE:
				1215	return(NULL);
				1216	case XML_CHAR_ENCODING_UTF8:
				1217	return("UTF-8");
				1218	case XML_CHAR_ENCODING_UTF16LE:
				1219	return("UTF-16");
				1220	case XML_CHAR_ENCODING_UTF16BE:
				1221	return("UTF-16");
				1222	case XML_CHAR_ENCODING_EBCDIC:
				1223	return("EBCDIC");
				1224	case XML_CHAR_ENCODING_UCS4LE:
				1225	return("ISO-10646-UCS-4");
				1226	case XML_CHAR_ENCODING_UCS4BE:
				1227	return("ISO-10646-UCS-4");
				1228	case XML_CHAR_ENCODING_UCS4_2143:
				1229	return("ISO-10646-UCS-4");
				1230	case XML_CHAR_ENCODING_UCS4_3412:
				1231	return("ISO-10646-UCS-4");
				1232	case XML_CHAR_ENCODING_UCS2:
				1233	return("ISO-10646-UCS-2");
				1234	case XML_CHAR_ENCODING_8859_1:
				1235	return("ISO-8859-1");
				1236	case XML_CHAR_ENCODING_8859_2:
				1237	return("ISO-8859-2");
				1238	case XML_CHAR_ENCODING_8859_3:
				1239	return("ISO-8859-3");
				1240	case XML_CHAR_ENCODING_8859_4:
				1241	return("ISO-8859-4");
				1242	case XML_CHAR_ENCODING_8859_5:
				1243	return("ISO-8859-5");
				1244	case XML_CHAR_ENCODING_8859_6:
				1245	return("ISO-8859-6");
				1246	case XML_CHAR_ENCODING_8859_7:
				1247	return("ISO-8859-7");
				1248	case XML_CHAR_ENCODING_8859_8:
				1249	return("ISO-8859-8");
				1250	case XML_CHAR_ENCODING_8859_9:
				1251	return("ISO-8859-9");
				1252	case XML_CHAR_ENCODING_2022_JP:
				1253	return("ISO-2022-JP");
				1254	case XML_CHAR_ENCODING_SHIFT_JIS:
				1255	return("Shift-JIS");
				1256	case XML_CHAR_ENCODING_EUC_JP:
				1257	return("EUC-JP");
				1258	case XML_CHAR_ENCODING_ASCII:
				1259	return(NULL);
				1260	}
				1261	return(NULL);
				1262	}
				1263
				1264	/****************************************************************
				1265	* *
				1266	* Char encoding handlers *
				1267	* *
				1268	****************************************************************/
				1269
				1270	/* the size should be growable, but it's not a big deal ... */
				1271	#define MAX_ENCODING_HANDLERS 50
				1272	static xmlCharEncodingHandlerPtr *handlers = NULL;
				1273	static int nbCharEncodingHandler = 0;
				1274
				1275	/*
				1276	* The default is UTF-8 for XML, that's also the default used for the
				1277	* parser internals, so the default encoding handler is NULL
				1278	*/
				1279
				1280	static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
				1281
				1282	/**
				1283	* xmlNewCharEncodingHandler:
				1284	* @name: the encoding name, in UTF-8 format (ASCII actually)
				1285	* @input: the xmlCharEncodingInputFunc to read that encoding
				1286	* @output: the xmlCharEncodingOutputFunc to write that encoding
				1287	*
				1288	* Create and registers an xmlCharEncodingHandler.
				1289	* Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
				1290	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1291	static xmlCharEncodingHandlerPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1292	xmlNewCharEncodingHandler(const char *name,
				1293	xmlCharEncodingInputFunc input,
				1294	xmlCharEncodingOutputFunc output) {
				1295	xmlCharEncodingHandlerPtr handler;
				1296	const char *alias;
				1297	char upper[500];
				1298	int i;
				1299	char *up = 0;
				1300
				1301	/*
				1302	* Do the alias resolution
				1303	*/
				1304	alias = xmlGetEncodingAlias(name);
				1305	if (alias != NULL)
				1306	name = alias;
				1307
				1308	/*
				1309	* Keep only the uppercase version of the encoding.
				1310	*/
				1311	if (name == NULL) {
				1312	xmlGenericError(xmlGenericErrorContext,
				1313	"xmlNewCharEncodingHandler : no name !\n");
				1314	return(NULL);
				1315	}
				1316	for (i = 0;i < 499;i++) {
				1317	upper[i] = toupper(name[i]);
				1318	if (upper[i] == 0) break;
				1319	}
				1320	upper[i] = 0;
				1321	up = xmlMemStrdup(upper);
				1322	if (up == NULL) {
				1323	xmlGenericError(xmlGenericErrorContext,
				1324	"xmlNewCharEncodingHandler : out of memory !\n");
				1325	return(NULL);
				1326	}
				1327
				1328	/*
				1329	* allocate and fill-up an handler block.
				1330	*/
				1331	handler = (xmlCharEncodingHandlerPtr)
				1332	xmlMalloc(sizeof(xmlCharEncodingHandler));
				1333	if (handler == NULL) {
				1334	xmlGenericError(xmlGenericErrorContext,
				1335	"xmlNewCharEncodingHandler : out of memory !\n");
				1336	return(NULL);
				1337	}
				1338	handler->input = input;
				1339	handler->output = output;
				1340	handler->name = up;
				1341
				1342	#ifdef LIBXML_ICONV_ENABLED
				1343	handler->iconv_in = NULL;
				1344	handler->iconv_out = NULL;
				1345	#endif /* LIBXML_ICONV_ENABLED */
				1346
				1347	/*
				1348	* registers and returns the handler.
				1349	*/
				1350	xmlRegisterCharEncodingHandler(handler);
				1351	#ifdef DEBUG_ENCODING
				1352	xmlGenericError(xmlGenericErrorContext,
				1353	"Registered encoding handler for %s\n", name);
				1354	#endif
				1355	return(handler);
				1356	}
				1357
				1358	/**
				1359	* xmlInitCharEncodingHandlers:
				1360	*
				1361	* Initialize the char encoding support, it registers the default
				1362	* encoding supported.
				1363	* NOTE: while public, this function usually doesn't need to be called
				1364	* in normal processing.
				1365	*/
				1366	void
				1367	xmlInitCharEncodingHandlers(void) {
				1368	unsigned short int tst = 0x1234;
				1369	unsigned char ptr = (unsigned char ) &tst;
				1370
				1371	if (handlers != NULL) return;
				1372
				1373	handlers = (xmlCharEncodingHandlerPtr *)
				1374	xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
				1375
				1376	if (*ptr == 0x12) xmlLittleEndian = 0;
				1377	else if (*ptr == 0x34) xmlLittleEndian = 1;
				1378	else xmlGenericError(xmlGenericErrorContext,
				1379	"Odd problem at endianness detection\n");
				1380
				1381	if (handlers == NULL) {
				1382	xmlGenericError(xmlGenericErrorContext,
				1383	"xmlInitCharEncodingHandlers : out of memory !\n");
				1384	return;
				1385	}
				1386	xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
				1387	xmlUTF16LEHandler =
				1388	xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
				1389	xmlUTF16BEHandler =
				1390	xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
				1391	xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
				1392	xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
				1393	#ifdef LIBXML_HTML_ENABLED
				1394	xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
				1395	#endif
				1396	}
				1397
				1398	/**
				1399	* xmlCleanupCharEncodingHandlers:
				1400	*
				1401	* Cleanup the memory allocated for the char encoding support, it
				1402	* unregisters all the encoding handlers and the aliases.
				1403	*/
				1404	void
				1405	xmlCleanupCharEncodingHandlers(void) {
				1406	xmlCleanupEncodingAliases();
				1407
				1408	if (handlers == NULL) return;
				1409
				1410	for (;nbCharEncodingHandler > 0;) {
				1411	nbCharEncodingHandler--;
				1412	if (handlers[nbCharEncodingHandler] != NULL) {
				1413	if (handlers[nbCharEncodingHandler]->name != NULL)
				1414	xmlFree(handlers[nbCharEncodingHandler]->name);
				1415	xmlFree(handlers[nbCharEncodingHandler]);
				1416	}
				1417	}
				1418	xmlFree(handlers);
				1419	handlers = NULL;
				1420	nbCharEncodingHandler = 0;
				1421	xmlDefaultCharEncodingHandler = NULL;
				1422	}
				1423
				1424	/**
				1425	* xmlRegisterCharEncodingHandler:
				1426	* @handler: the xmlCharEncodingHandlerPtr handler block
				1427	*
				1428	* Register the char encoding handler, surprizing, isn't it ?
				1429	*/
				1430	void
				1431	xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
				1432	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1433	if (handler == NULL) {
				1434	xmlGenericError(xmlGenericErrorContext,
				1435	"xmlRegisterCharEncodingHandler: NULL handler !\n");
				1436	return;
				1437	}
				1438
				1439	if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
				1440	xmlGenericError(xmlGenericErrorContext,
				1441	"xmlRegisterCharEncodingHandler: Too many handler registered\n");
				1442	xmlGenericError(xmlGenericErrorContext,
				1443	"\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
				1444	return;
				1445	}
				1446	handlers[nbCharEncodingHandler++] = handler;
				1447	}
				1448
				1449	/**
				1450	* xmlGetCharEncodingHandler:
				1451	* @enc: an xmlCharEncoding value.
				1452	*
				1453	* Search in the registrered set the handler able to read/write that encoding.
				1454	*
				1455	* Returns the handler or NULL if not found
				1456	*/
				1457	xmlCharEncodingHandlerPtr
				1458	xmlGetCharEncodingHandler(xmlCharEncoding enc) {
				1459	xmlCharEncodingHandlerPtr handler;
				1460
				1461	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1462	switch (enc) {
				1463	case XML_CHAR_ENCODING_ERROR:
				1464	return(NULL);
				1465	case XML_CHAR_ENCODING_NONE:
				1466	return(NULL);
				1467	case XML_CHAR_ENCODING_UTF8:
				1468	return(NULL);
				1469	case XML_CHAR_ENCODING_UTF16LE:
				1470	return(xmlUTF16LEHandler);
				1471	case XML_CHAR_ENCODING_UTF16BE:
				1472	return(xmlUTF16BEHandler);
				1473	case XML_CHAR_ENCODING_EBCDIC:
				1474	handler = xmlFindCharEncodingHandler("EBCDIC");
				1475	if (handler != NULL) return(handler);
				1476	handler = xmlFindCharEncodingHandler("ebcdic");
				1477	if (handler != NULL) return(handler);
				1478	break;
				1479	case XML_CHAR_ENCODING_UCS4BE:
				1480	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1481	if (handler != NULL) return(handler);
				1482	handler = xmlFindCharEncodingHandler("UCS-4");
				1483	if (handler != NULL) return(handler);
				1484	handler = xmlFindCharEncodingHandler("UCS4");
				1485	if (handler != NULL) return(handler);
				1486	break;
				1487	case XML_CHAR_ENCODING_UCS4LE:
				1488	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1489	if (handler != NULL) return(handler);
				1490	handler = xmlFindCharEncodingHandler("UCS-4");
				1491	if (handler != NULL) return(handler);
				1492	handler = xmlFindCharEncodingHandler("UCS4");
				1493	if (handler != NULL) return(handler);
				1494	break;
				1495	case XML_CHAR_ENCODING_UCS4_2143:
				1496	break;
				1497	case XML_CHAR_ENCODING_UCS4_3412:
				1498	break;
				1499	case XML_CHAR_ENCODING_UCS2:
				1500	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
				1501	if (handler != NULL) return(handler);
				1502	handler = xmlFindCharEncodingHandler("UCS-2");
				1503	if (handler != NULL) return(handler);
				1504	handler = xmlFindCharEncodingHandler("UCS2");
				1505	if (handler != NULL) return(handler);
				1506	break;
				1507
				1508	/*
				1509	* We used to keep ISO Latin encodings native in the
				1510	* generated data. This led to so many problems that
				1511	* this has been removed. One can still change this
				1512	* back by registering no-ops encoders for those
				1513	*/
				1514	case XML_CHAR_ENCODING_8859_1:
				1515	handler = xmlFindCharEncodingHandler("ISO-8859-1");
				1516	if (handler != NULL) return(handler);
				1517	break;
				1518	case XML_CHAR_ENCODING_8859_2:
				1519	handler = xmlFindCharEncodingHandler("ISO-8859-2");
				1520	if (handler != NULL) return(handler);
				1521	break;
				1522	case XML_CHAR_ENCODING_8859_3:
				1523	handler = xmlFindCharEncodingHandler("ISO-8859-3");
				1524	if (handler != NULL) return(handler);
				1525	break;
				1526	case XML_CHAR_ENCODING_8859_4:
				1527	handler = xmlFindCharEncodingHandler("ISO-8859-4");
				1528	if (handler != NULL) return(handler);
				1529	break;
				1530	case XML_CHAR_ENCODING_8859_5:
				1531	handler = xmlFindCharEncodingHandler("ISO-8859-5");
				1532	if (handler != NULL) return(handler);
				1533	break;
				1534	case XML_CHAR_ENCODING_8859_6:
				1535	handler = xmlFindCharEncodingHandler("ISO-8859-6");
				1536	if (handler != NULL) return(handler);
				1537	break;
				1538	case XML_CHAR_ENCODING_8859_7:
				1539	handler = xmlFindCharEncodingHandler("ISO-8859-7");
				1540	if (handler != NULL) return(handler);
				1541	break;
				1542	case XML_CHAR_ENCODING_8859_8:
				1543	handler = xmlFindCharEncodingHandler("ISO-8859-8");
				1544	if (handler != NULL) return(handler);
				1545	break;
				1546	case XML_CHAR_ENCODING_8859_9:
				1547	handler = xmlFindCharEncodingHandler("ISO-8859-9");
				1548	if (handler != NULL) return(handler);
				1549	break;
				1550
				1551
				1552	case XML_CHAR_ENCODING_2022_JP:
				1553	handler = xmlFindCharEncodingHandler("ISO-2022-JP");
				1554	if (handler != NULL) return(handler);
				1555	break;
				1556	case XML_CHAR_ENCODING_SHIFT_JIS:
				1557	handler = xmlFindCharEncodingHandler("SHIFT-JIS");
				1558	if (handler != NULL) return(handler);
				1559	handler = xmlFindCharEncodingHandler("SHIFT_JIS");
				1560	if (handler != NULL) return(handler);
				1561	handler = xmlFindCharEncodingHandler("Shift_JIS");
				1562	if (handler != NULL) return(handler);
				1563	break;
				1564	case XML_CHAR_ENCODING_EUC_JP:
				1565	handler = xmlFindCharEncodingHandler("EUC-JP");
				1566	if (handler != NULL) return(handler);
				1567	break;
				1568	default:
				1569	break;
				1570	}
				1571
				1572	#ifdef DEBUG_ENCODING
				1573	xmlGenericError(xmlGenericErrorContext,
				1574	"No handler found for encoding %d\n", enc);
				1575	#endif
				1576	return(NULL);
				1577	}
				1578
				1579	/**
				1580	* xmlGetCharEncodingHandler:
				1581	* @enc: a string describing the char encoding.
				1582	*
				1583	* Search in the registrered set the handler able to read/write that encoding.
				1584	*
				1585	* Returns the handler or NULL if not found
				1586	*/
				1587	xmlCharEncodingHandlerPtr
				1588	xmlFindCharEncodingHandler(const char *name) {
				1589	const char *nalias;
				1590	const char *norig;
				1591	xmlCharEncoding alias;
				1592	#ifdef LIBXML_ICONV_ENABLED
				1593	xmlCharEncodingHandlerPtr enc;
				1594	iconv_t icv_in, icv_out;
				1595	#endif /* LIBXML_ICONV_ENABLED */
				1596	char upper[100];
				1597	int i;
				1598
				1599	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1600	if (name == NULL) return(xmlDefaultCharEncodingHandler);
				1601	if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
				1602
				1603	/*
				1604	* Do the alias resolution
				1605	*/
				1606	norig = name;
				1607	nalias = xmlGetEncodingAlias(name);
				1608	if (nalias != NULL)
				1609	name = nalias;
				1610
				1611	/*
				1612	* Check first for directly registered encoding names
				1613	*/
				1614	for (i = 0;i < 99;i++) {
				1615	upper[i] = toupper(name[i]);
				1616	if (upper[i] == 0) break;
				1617	}
				1618	upper[i] = 0;
				1619
				1620	for (i = 0;i < nbCharEncodingHandler; i++)
				1621	if (!strcmp(upper, handlers[i]->name)) {
				1622	#ifdef DEBUG_ENCODING
				1623	xmlGenericError(xmlGenericErrorContext,
				1624	"Found registered handler for encoding %s\n", name);
				1625	#endif
				1626	return(handlers[i]);
				1627	}
				1628
				1629	#ifdef LIBXML_ICONV_ENABLED
				1630	/* check whether iconv can handle this */
				1631	icv_in = iconv_open("UTF-8", name);
				1632	icv_out = iconv_open(name, "UTF-8");
				1633	if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
				1634	enc = (xmlCharEncodingHandlerPtr)
				1635	xmlMalloc(sizeof(xmlCharEncodingHandler));
				1636	if (enc == NULL) {
				1637	iconv_close(icv_in);
				1638	iconv_close(icv_out);
				1639	return(NULL);
				1640	}
				1641	enc->name = xmlMemStrdup(name);
				1642	enc->input = NULL;
				1643	enc->output = NULL;
				1644	enc->iconv_in = icv_in;
				1645	enc->iconv_out = icv_out;
				1646	#ifdef DEBUG_ENCODING
				1647	xmlGenericError(xmlGenericErrorContext,
				1648	"Found iconv handler for encoding %s\n", name);
				1649	#endif
				1650	return enc;
				1651	} else if ((icv_in != (iconv_t) -1) \|\| icv_out != (iconv_t) -1) {
				1652	xmlGenericError(xmlGenericErrorContext,
				1653	"iconv : problems with filters for '%s'\n", name);
				1654	}
				1655	#endif /* LIBXML_ICONV_ENABLED */
				1656
				1657	#ifdef DEBUG_ENCODING
				1658	xmlGenericError(xmlGenericErrorContext,
				1659	"No handler found for encoding %s\n", name);
				1660	#endif
				1661
				1662	/*
				1663	* Fallback using the canonical names
				1664	*/
				1665	alias = xmlParseCharEncoding(norig);
				1666	if (alias != XML_CHAR_ENCODING_ERROR) {
				1667	const char* canon;
				1668	canon = xmlGetCharEncodingName(alias);
				1669	if ((canon != NULL) && (strcmp(name, canon))) {
				1670	return(xmlFindCharEncodingHandler(canon));
				1671	}
				1672	}
				1673
				1674	return(NULL);
				1675	}
				1676
				#ifdef LIBXML_ICONV_ENABLED
				/**
				 * xmlIconvWrapper:
				 * @cd:  iconv converter data structure
				 * @out:  a pointer to an array of bytes to store the result
				 * @outlen:  the length of @out
				 * @in:  a pointer to an array of bytes to convert
				 * @inlen:  the length of @in
				 *
				 * Runs one iconv() conversion step and maps its outcome to the return
				 * codes used by the encoding handlers.
				 *
				 * Returns 0 if success, or
				 *     -1 by lack of space (E2BIG), or
				 *     -2 if the transcoding fails (EILSEQ: the input is not valid or
				 *        the result of transformation can't fit into the encoding we
				 *        want), or
				 *     -3 if the last bytes can't form a single output char (EINVAL)
				 *        or for any other failure.
				 *
				 * The value of @inlen after return is the number of octets consumed.
				 * The value of @outlen after return is the number of octets produced.
				 * Both are set to 0 when @in is NULL.
				 */
				static int
				xmlIconvWrapper(iconv_t cd,
				                unsigned char *out, int *outlen,
				                const unsigned char *in, int *inlen) {

				    size_t icv_inlen = *inlen, icv_outlen = *outlen;
				    const char *icv_in = (const char *) in;
				    char *icv_out = (char *) out;
				    int ret;

				    ret = iconv(cd, &icv_in, &icv_inlen, &icv_out, &icv_outlen);
				    if (in != NULL) {
				        /* iconv() leaves the remaining byte counts in icv_*len */
				        *inlen -= icv_inlen;
				        *outlen -= icv_outlen;
				    } else {
				        *inlen = 0;
				        *outlen = 0;
				    }
				    if ((icv_inlen != 0) || (ret == -1)) {
				#ifdef EILSEQ
				        if (errno == EILSEQ) {
				            return -2;
				        } else
				#endif
				#ifdef E2BIG
				        if (errno == E2BIG) {
				            return -1;
				        } else
				#endif
				#ifdef EINVAL
				        if (errno == EINVAL) {
				            return -3;
				        } else
				#endif
				        {
				            return -3;
				        }
				    }
				    return 0;
				}
				#endif /* LIBXML_ICONV_ENABLED */
				1737
				1738	/**
				1739	* xmlCharEncFirstLine:
				1740	* @handler: char enconding transformation data structure
				1741	* @out: an xmlBuffer for the output.
				1742	* @in: an xmlBuffer for the input
				1743	*
				1744	* Front-end for the encoding handler input function, but handle only
				1745	* the very first line, i.e. limit itself to 45 chars.
				1746	*
				1747	* Returns the number of byte written if success, or
				1748	* -1 general error
				1749	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1750	* the result of transformation can't fit into the encoding we want), or
				1751	*/
				1752	int
				1753	xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				1754	xmlBufferPtr in) {
				1755	int ret = -2;
				1756	int written;
				1757	int toconv;
				1758
				1759	if (handler == NULL) return(-1);
				1760	if (out == NULL) return(-1);
				1761	if (in == NULL) return(-1);
				1762
				1763	written = out->size - out->use;
				1764	toconv = in->use;
				1765	if (toconv * 2 >= written) {
				1766	xmlBufferGrow(out, toconv);
				1767	written = out->size - out->use - 1;
				1768	}
				1769
				1770	/*
				1771	* echo '<?xml version="1.0" encoding="UCS4"?>' \| wc -c => 38
				1772	* 45 chars should be sufficient to reach the end of the encoding
				1773	* decalration without going too far inside the document content.
				1774	*/
				1775	written = 45;
				1776
				1777	if (handler->input != NULL) {
				1778	ret = handler->input(&out->content[out->use], &written,
				1779	in->content, &toconv);
				1780	xmlBufferShrink(in, toconv);
				1781	out->use += written;
				1782	out->content[out->use] = 0;
				1783	}
				1784	#ifdef LIBXML_ICONV_ENABLED
				1785	else if (handler->iconv_in != NULL) {
				1786	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				1787	&written, in->content, &toconv);
				1788	xmlBufferShrink(in, toconv);
				1789	out->use += written;
				1790	out->content[out->use] = 0;
				1791	if (ret == -1) ret = -3;
				1792	}
				1793	#endif /* LIBXML_ICONV_ENABLED */
				1794	#ifdef DEBUG_ENCODING
				1795	switch (ret) {
				1796	case 0:
				1797	xmlGenericError(xmlGenericErrorContext,
				1798	"converted %d bytes to %d bytes of input\n",
				1799	toconv, written);
				1800	break;
				1801	case -1:
				1802	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				1803	toconv, written, in->use);
				1804	break;
				1805	case -2:
				1806	xmlGenericError(xmlGenericErrorContext,
				1807	"input conversion failed due to input error\n");
				1808	break;
				1809	case -3:
				1810	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				1811	toconv, written, in->use);
				1812	break;
				1813	default:
				1814	xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
				1815	}
				1816	#endif
				1817	/*
				1818	* Ignore when input buffer is not on a boundary
				1819	*/
				1820	if (ret == -3) ret = 0;
				1821	if (ret == -1) ret = 0;
				1822	return(ret);
				1823	}
				1824
				1825	/**
				1826	* xmlCharEncInFunc:
				1827	* @handler: char enconding transformation data structure
				1828	* @out: an xmlBuffer for the output.
				1829	* @in: an xmlBuffer for the input
				1830	*
				1831	* Generic front-end for the encoding handler input function
				1832	*
				1833	* Returns the number of byte written if success, or
				1834	* -1 general error
				1835	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1836	* the result of transformation can't fit into the encoding we want), or
				1837	*/
				1838	int
				1839	xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				1840	xmlBufferPtr in) {
				1841	int ret = -2;
				1842	int written;
				1843	int toconv;
				1844
				1845	if (handler == NULL) return(-1);
				1846	if (out == NULL) return(-1);
				1847	if (in == NULL) return(-1);
				1848
				1849	toconv = in->use;
				1850	if (toconv == 0)
				1851	return(0);
				1852	written = out->size - out->use;
				1853	if (toconv * 2 >= written) {
				1854	xmlBufferGrow(out, out->size + toconv * 2);
				1855	written = out->size - out->use - 1;
				1856	}
				1857	if (handler->input != NULL) {
				1858	ret = handler->input(&out->content[out->use], &written,
				1859	in->content, &toconv);
				1860	xmlBufferShrink(in, toconv);
				1861	out->use += written;
				1862	out->content[out->use] = 0;
				1863	}
				1864	#ifdef LIBXML_ICONV_ENABLED
				1865	else if (handler->iconv_in != NULL) {
				1866	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				1867	&written, in->content, &toconv);
				1868	xmlBufferShrink(in, toconv);
				1869	out->use += written;
				1870	out->content[out->use] = 0;
				1871	if (ret == -1) ret = -3;
				1872	}
				1873	#endif /* LIBXML_ICONV_ENABLED */
				1874	switch (ret) {
				1875	#ifdef DEBUG_ENCODING
				1876	case 0:
				1877	xmlGenericError(xmlGenericErrorContext,
				1878	"converted %d bytes to %d bytes of input\n",
				1879	toconv, written);
				1880	break;
				1881	case -1:
				1882	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				1883	toconv, written, in->use);
				1884	break;
				1885	case -3:
				1886	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				1887	toconv, written, in->use);
				1888	break;
				1889	#endif
				1890	case -2:
				1891	xmlGenericError(xmlGenericErrorContext,
				1892	"input conversion failed due to input error\n");
				1893	xmlGenericError(xmlGenericErrorContext,
				1894	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				1895	in->content[0], in->content[1],
				1896	in->content[2], in->content[3]);
				1897	}
				1898	/*
				1899	* Ignore when input buffer is not on a boundary
				1900	*/
				1901	if (ret == -3) ret = 0;
				1902	return(ret);
				1903	}
				1904
				1905	/**
				1906	* xmlCharEncOutFunc:
				1907	* @handler: char enconding transformation data structure
				1908	* @out: an xmlBuffer for the output.
				1909	* @in: an xmlBuffer for the input
				1910	*
				1911	* Generic front-end for the encoding handler output function
				1912	* a first call with @in == NULL has to be made firs to initiate the
				1913	* output in case of non-stateless encoding needing to initiate their
				1914	* state or the output (like the BOM in UTF16).
				1915	* In case of UTF8 sequence conversion errors for the given encoder,
				1916	* the content will be automatically remapped to a CharRef sequence.
				1917	*
				1918	* Returns the number of byte written if success, or
				1919	* -1 general error
				1920	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1921	* the result of transformation can't fit into the encoding we want), or
				1922	*/
				1923	int
				1924	xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				1925	xmlBufferPtr in) {
				1926	int ret = -2;
				1927	int written;
				1928	int writtentot = 0;
				1929	int toconv;
				1930	int output = 0;
				1931
				1932	if (handler == NULL) return(-1);
				1933	if (out == NULL) return(-1);
				1934
				1935	retry:
				1936
				1937	written = out->size - out->use;
				1938
				1939	/*
				1940	* First specific handling of in = NULL, i.e. the initialization call
				1941	*/
				1942	if (in == NULL) {
				1943	toconv = 0;
				1944	if (handler->output != NULL) {
				1945	ret = handler->output(&out->content[out->use], &written,
				1946	NULL, &toconv);
				1947	out->use += written;
				1948	out->content[out->use] = 0;
				1949	}
				1950	#ifdef LIBXML_ICONV_ENABLED
				1951	else if (handler->iconv_out != NULL) {
				1952	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				1953	&written, NULL, &toconv);
				1954	out->use += written;
				1955	out->content[out->use] = 0;
				1956	}
				1957	#endif /* LIBXML_ICONV_ENABLED */
				1958	#ifdef DEBUG_ENCODING
				1959	xmlGenericError(xmlGenericErrorContext,
				1960	"initialized encoder\n");
				1961	#endif
				1962	return(0);
				1963	}
				1964
				1965	/*
				1966	* Convertion itself.
				1967	*/
				1968	toconv = in->use;
				1969	if (toconv == 0)
				1970	return(0);
				1971	if (toconv * 2 >= written) {
				1972	xmlBufferGrow(out, toconv * 2);
				1973	written = out->size - out->use - 1;
				1974	}
				1975	if (handler->output != NULL) {
				1976	ret = handler->output(&out->content[out->use], &written,
				1977	in->content, &toconv);
				1978	xmlBufferShrink(in, toconv);
				1979	out->use += written;
				1980	writtentot += written;
				1981	out->content[out->use] = 0;
				1982	}
				1983	#ifdef LIBXML_ICONV_ENABLED
				1984	else if (handler->iconv_out != NULL) {
				1985	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				1986	&written, in->content, &toconv);
				1987	xmlBufferShrink(in, toconv);
				1988	out->use += written;
				1989	writtentot += written;
				1990	out->content[out->use] = 0;
				1991	if (ret == -1) {
				1992	if (written > 0) {
				1993	/*
				1994	* Can be a limitation of iconv
				1995	*/
				1996	goto retry;
				1997	}
				1998	ret = -3;
				1999	}
				2000	}
				2001	#endif /* LIBXML_ICONV_ENABLED */
				2002	else {
				2003	xmlGenericError(xmlGenericErrorContext,
				2004	"xmlCharEncOutFunc: no output function !\n");
				2005	return(-1);
				2006	}
				2007
				2008	if (ret >= 0) output += ret;
				2009
				2010	/*
				2011	* Attempt to handle error cases
				2012	*/
				2013	switch (ret) {
				2014	#ifdef DEBUG_ENCODING
				2015	case 0:
				2016	xmlGenericError(xmlGenericErrorContext,
				2017	"converted %d bytes to %d bytes of output\n",
				2018	toconv, written);
				2019	break;
				2020	case -1:
				2021	xmlGenericError(xmlGenericErrorContext,
				2022	"output conversion failed by lack of space\n");
				2023	break;
				2024	#endif
				2025	case -3:
				2026	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
				2027	toconv, written, in->use);
				2028	break;
				2029	case -2: {
				2030	int len = in->use;
				2031	const xmlChar utf = (const xmlChar ) in->content;
				2032	int cur;
				2033
				2034	cur = xmlGetUTF8Char(utf, &len);
				2035	if (cur > 0) {
				2036	xmlChar charref[20];
				2037
				2038	#ifdef DEBUG_ENCODING
				2039	xmlGenericError(xmlGenericErrorContext,
				2040	"handling output conversion error\n");
				2041	xmlGenericError(xmlGenericErrorContext,
				2042	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2043	in->content[0], in->content[1],
				2044	in->content[2], in->content[3]);
				2045	#endif
				2046	/*
				2047	* Removes the UTF8 sequence, and replace it by a charref
				2048	* and continue the transcoding phase, hoping the error
				2049	* did not mangle the encoder state.
				2050	*/
				2051	sprintf((char *) charref, "&#x%X;", cur);
				2052	xmlBufferShrink(in, len);
				2053	xmlBufferAddHead(in, charref, -1);
				2054
				2055	goto retry;
				2056	} else {
				2057	xmlGenericError(xmlGenericErrorContext,
				2058	"output conversion failed due to conv error\n");
				2059	xmlGenericError(xmlGenericErrorContext,
				2060	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2061	in->content[0], in->content[1],
				2062	in->content[2], in->content[3]);
				2063	in->content[0] = ' ';
				2064	}
				2065	break;
				2066	}
				2067	}
				2068	return(ret);
				2069	}
				2070
				2071	/**
				2072	* xmlCharEncCloseFunc:
				2073	* @handler: char enconding transformation data structure
				2074	*
				2075	* Generic front-end for hencoding handler close function
				2076	*
				2077	* Returns 0 if success, or -1 in case of error
				2078	*/
				2079	int
				2080	xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
				2081	int ret = 0;
				2082	if (handler == NULL) return(-1);
				2083	if (handler->name == NULL) return(-1);
				2084	#ifdef LIBXML_ICONV_ENABLED
				2085	/*
				2086	* Iconv handlers can be oused only once, free the whole block.
				2087	* and the associated icon resources.
				2088	*/
				2089	if ((handler->iconv_out != NULL) \|\| (handler->iconv_in != NULL)) {
				2090	if (handler->name != NULL)
				2091	xmlFree(handler->name);
				2092	handler->name = NULL;
				2093	if (handler->iconv_out != NULL) {
				2094	if (iconv_close(handler->iconv_out))
				2095	ret = -1;
				2096	handler->iconv_out = NULL;
				2097	}
				2098	if (handler->iconv_in != NULL) {
				2099	if (iconv_close(handler->iconv_in))
				2100	ret = -1;
				2101	handler->iconv_in = NULL;
				2102	}
				2103	xmlFree(handler);
				2104	}
				2105	#endif /* LIBXML_ICONV_ENABLED */
				2106	#ifdef DEBUG_ENCODING
				2107	if (ret)
				2108	xmlGenericError(xmlGenericErrorContext,
				2109	"failed to close the encoding handler\n");
				2110	else
				2111	xmlGenericError(xmlGenericErrorContext,
				2112	"closed the encoding handler\n");
				2113
				2114	#endif
				2115	return(ret);
				2116	}
				2117