Blame - encoding.c - platform/external/libxml2

blob: de55e2e0f49cffed958c8a435916306e90c17230 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* encoding.c : implements the encoding conversion functions needed for XML
				3	*
				4	* Related specs:
				5	* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
				6	* rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
				7	* [ISO-10646] UTF-8 and UTF-16 in Annexes
				8	* [ISO-8859-1] ISO Latin-1 characters codes.
				9	* [UNICODE] The Unicode Consortium, "The Unicode Standard --
				10	* Worldwide Character Encoding -- Version 1.0", Addison-
				11	* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
				12	* described in Unicode Technical Report #4.
				13	* [US-ASCII] Coded Character Set--7-bit American Standard Code for
				14	* Information Interchange, ANSI X3.4-1986.
				15	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	16	* See Copyright for the status of this software.
				17	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	18	* daniel@veillard.com
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	19	*
				20	* UTF8 string routines from:
				21	* "William M. Brack" <wbrack@mmm.com.hk>
				22	*
				23	* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	24	*/
				25
Daniel Veillard	34ce8be	2002-03-18 19:37:11 +0000	[diff] [blame]	26	#define IN_LIBXML
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	27	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	28
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	29	#include <string.h>
				30
				31	#ifdef HAVE_CTYPE_H
				32	#include <ctype.h>
				33	#endif
				34	#ifdef HAVE_STDLIB_H
				35	#include <stdlib.h>
				36	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	37	#ifdef LIBXML_ICONV_ENABLED
				38	#ifdef HAVE_ERRNO_H
				39	#include <errno.h>
				40	#endif
				41	#endif
				42	#include <libxml/encoding.h>
				43	#include <libxml/xmlmemory.h>
				44	#ifdef LIBXML_HTML_ENABLED
				45	#include <libxml/HTMLparser.h>
				46	#endif
Daniel Veillard	64a411c	2001-10-15 12:32:07 +0000	[diff] [blame]	47	#include <libxml/globals.h>
Daniel Veillard	a4617b8	2001-11-04 20:19:12 +0000	[diff] [blame]	48	#include <libxml/xmlerror.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	49
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	50	static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
				51	static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	52
				53	typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
				54	typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
				55	struct _xmlCharEncodingAlias {
				56	const char *name;
				57	const char *alias;
				58	};
				59
				60	static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
				61	static int xmlCharEncodingAliasesNb = 0;
				62	static int xmlCharEncodingAliasesMax = 0;
				63
				64	#ifdef LIBXML_ICONV_ENABLED
				65	#if 0
				66	#define DEBUG_ENCODING /* Define this to get encoding traces */
				67	#endif
				68	#endif
				69
				70	static int xmlLittleEndian = 1;
				71
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	72	/************************************************************************
				73	* *
				74	* Generic UTF8 handling routines *
				75	* *
				76	* From rfc2044: encoding of the Unicode values on UTF-8: *
				77	* *
				78	* UCS-4 range (hex.) UTF-8 octet sequence (binary) *
				79	* 0000 0000-0000 007F 0xxxxxxx *
				80	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
				81	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
				82	* *
				83	* I hope we won't use values > 0xFFFF anytime soon ! *
				84	* *
				85	************************************************************************/
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	86
				87	/**
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame]	88	* xmlUTF8Strlen:
				89	* @utf: a sequence of UTF-8 encoded bytes
				90	*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	91	* compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame]	92	* checking of the content of the string.
				93	*
				94	* Returns the number of characters in the string or -1 in case of error
				95	*/
				96	int
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	97	xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame]	98	int ret = 0;
				99
				100	if (utf == NULL)
				101	return(-1);
				102
				103	while (*utf != 0) {
				104	if (utf[0] & 0x80) {
				105	if ((utf[1] & 0xc0) != 0x80)
				106	return(-1);
				107	if ((utf[0] & 0xe0) == 0xe0) {
				108	if ((utf[2] & 0xc0) != 0x80)
				109	return(-1);
				110	if ((utf[0] & 0xf0) == 0xf0) {
				111	if ((utf[0] & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				112	return(-1);
				113	utf += 4;
				114	} else {
				115	utf += 3;
				116	}
				117	} else {
				118	utf += 2;
				119	}
				120	} else {
				121	utf++;
				122	}
				123	ret++;
				124	}
				125	return(ret);
				126	}
				127
				128	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	129	* xmlGetUTF8Char:
				130	* @utf: a sequence of UTF-8 encoded bytes
				131	* @len: a pointer to @bytes len
				132	*
				133	* Read one UTF8 Char from @utf
				134	*
				135	* Returns the char value or -1 in case of error and update @len with the
				136	* number of bytes used
				137	*/
Daniel Veillard	f000f07	2002-10-22 14:28:17 +0000	[diff] [blame]	138	int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	139	xmlGetUTF8Char(const unsigned char utf, int len) {
				140	unsigned int c;
				141
				142	if (utf == NULL)
				143	goto error;
				144	if (len == NULL)
				145	goto error;
				146	if (*len < 1)
				147	goto error;
				148
				149	c = utf[0];
				150	if (c & 0x80) {
				151	if (*len < 2)
				152	goto error;
				153	if ((utf[1] & 0xc0) != 0x80)
				154	goto error;
				155	if ((c & 0xe0) == 0xe0) {
				156	if (*len < 3)
				157	goto error;
				158	if ((utf[2] & 0xc0) != 0x80)
				159	goto error;
				160	if ((c & 0xf0) == 0xf0) {
				161	if (*len < 4)
				162	goto error;
				163	if ((c & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				164	goto error;
				165	*len = 4;
				166	/* 4-byte code */
				167	c = (utf[0] & 0x7) << 18;
				168	c \|= (utf[1] & 0x3f) << 12;
				169	c \|= (utf[2] & 0x3f) << 6;
				170	c \|= utf[3] & 0x3f;
				171	} else {
				172	/* 3-byte code */
				173	*len = 3;
				174	c = (utf[0] & 0xf) << 12;
				175	c \|= (utf[1] & 0x3f) << 6;
				176	c \|= utf[2] & 0x3f;
				177	}
				178	} else {
				179	/* 2-byte code */
				180	*len = 2;
				181	c = (utf[0] & 0x1f) << 6;
				182	c \|= utf[1] & 0x3f;
				183	}
				184	} else {
				185	/* 1-byte code */
				186	*len = 1;
				187	}
				188	return(c);
				189
				190	error:
				191	*len = 0;
				192	return(-1);
				193	}
				194
				195	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	196	* xmlCheckUTF8:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	197	* @utf: Pointer to putative utf-8 encoded string.
				198	*
				199	* Checks @utf for being valid utf-8. @utf is assumed to be
				200	* null-terminated. This function is not super-strict, as it will
				201	* allow longer utf-8 sequences than necessary. Note that Java is
				202	* capable of producing these sequences if provoked. Also note, this
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	203	* routine checks for the 4-byte maximum size, but does not check for
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	204	* 0x10ffff maximum value.
				205	*
				206	* Return value: true if @utf is valid.
				207	**/
				208	int
				209	xmlCheckUTF8(const unsigned char *utf)
				210	{
				211	int ix;
				212	unsigned char c;
				213
				214	for (ix = 0; (c = utf[ix]);) {
				215	if (c & 0x80) {
				216	if ((utf[ix + 1] & 0xc0) != 0x80)
				217	return(0);
				218	if ((c & 0xe0) == 0xe0) {
				219	if ((utf[ix + 2] & 0xc0) != 0x80)
				220	return(0);
				221	if ((c & 0xf0) == 0xf0) {
				222	if ((c & 0xf8) != 0xf0 \|\| (utf[ix + 3] & 0xc0) != 0x80)
				223	return(0);
				224	ix += 4;
				225	/* 4-byte code */
				226	} else
				227	/* 3-byte code */
				228	ix += 3;
				229	} else
				230	/* 2-byte code */
				231	ix += 2;
				232	} else
				233	/* 1-byte code */
				234	ix++;
				235	}
				236	return(1);
				237	}
				238
				239	/**
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	240	* xmlUTF8Strsize:
				241	* @utf: a sequence of UTF-8 encoded bytes
				242	* @len: the number of characters in the array
				243	*
				244	* storage size of an UTF8 string
				245	*
				246	* Returns the storage size of
				247	* the first 'len' characters of ARRAY
				248	*
				249	*/
				250
				251	int
				252	xmlUTF8Strsize(const xmlChar *utf, int len) {
				253	const xmlChar *ptr=utf;
				254	xmlChar ch;
				255
				256	if (len <= 0)
				257	return(0);
				258
				259	while ( len-- > 0) {
				260	if ( !*ptr )
				261	break;
				262	if ( (ch = *ptr++) & 0x80)
				263	while ( (ch<<=1) & 0x80 )
				264	ptr++;
				265	}
				266	return (ptr - utf);
				267	}
				268
				269
				270	/**
				271	* xmlUTF8Strndup:
				272	* @utf: the input UTF8 *
				273	* @len: the len of @utf (in chars)
				274	*
				275	* a strndup for array of UTF8's
				276	*
				277	* Returns a new UTF8 * or NULL
				278	*/
				279	xmlChar *
				280	xmlUTF8Strndup(const xmlChar *utf, int len) {
				281	xmlChar *ret;
				282	int i;
				283
				284	if ((utf == NULL) \|\| (len < 0)) return(NULL);
				285	i = xmlUTF8Strsize(utf, len);
				286	ret = (xmlChar ) xmlMalloc((i + 1) sizeof(xmlChar));
				287	if (ret == NULL) {
				288	xmlGenericError(xmlGenericErrorContext,
				289	"malloc of %ld byte failed\n",
				290	(len + 1) * (long)sizeof(xmlChar));
				291	return(NULL);
				292	}
				293	memcpy(ret, utf, i * sizeof(xmlChar));
				294	ret[i] = 0;
				295	return(ret);
				296	}
				297
				298	/**
				299	* xmlUTF8Strpos:
				300	* @utf: the input UTF8 *
				301	* @pos: the position of the desired UTF8 char (in chars)
				302	*
				303	* a function to provide the equivalent of fetching a
				304	* character from a string array
				305	*
				306	* Returns a pointer to the UTF8 character or NULL
				307	*/
				308	xmlChar *
				309	xmlUTF8Strpos(const xmlChar *utf, int pos) {
				310	xmlChar ch;
				311
				312	if (utf == NULL) return(NULL);
				313	if ( (pos < 0) \|\| (pos >= xmlUTF8Strlen(utf)) )
				314	return(NULL);
				315	while (pos--) {
				316	if ((ch=*utf++) == 0) return(NULL);
				317	if ( ch & 0x80 ) {
				318	/* if not simple ascii, verify proper format */
				319	if ( (ch & 0xc0) != 0xc0 )
				320	return(NULL);
				321	/* then skip over remaining bytes for this char */
				322	while ( (ch <<= 1) & 0x80 )
				323	if ( (*utf++ & 0xc0) != 0x80 )
				324	return(NULL);
				325	}
				326	}
				327	return((xmlChar *)utf);
				328	}
				329
				330	/**
				331	* xmlUTF8Strloc:
				332	* @utf: the input UTF8 *
				333	* @utfchar: the UTF8 character to be found
				334	*
				335	* a function to provide relative location of a UTF8 char
				336	*
				337	* Returns the relative character position of the desired char
				338	* or -1 if not found
				339	*/
				340	int
				341	xmlUTF8Strloc(const xmlChar utf, const xmlChar utfchar) {
				342	int i, size;
				343	xmlChar ch;
				344
				345	if (utf==NULL \|\| utfchar==NULL) return -1;
				346	size = xmlUTF8Strsize(utfchar, 1);
				347	for(i=0; (ch=*utf) != 0; i++) {
				348	if (xmlStrncmp(utf, utfchar, size)==0)
				349	return(i);
				350	utf++;
				351	if ( ch & 0x80 ) {
				352	/* if not simple ascii, verify proper format */
				353	if ( (ch & 0xc0) != 0xc0 )
				354	return(-1);
				355	/* then skip over remaining bytes for this char */
				356	while ( (ch <<= 1) & 0x80 )
				357	if ( (*utf++ & 0xc0) != 0x80 )
				358	return(-1);
				359	}
				360	}
				361
				362	return(-1);
				363	}
				364	/**
				365	* xmlUTF8Strsub:
				366	* @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	367	* @start: relative pos of first char
				368	* @len: total number to copy
				369	*
				370	* Note: positions are given in units of UTF-8 chars
				371	*
				372	* Returns a pointer to a newly created string
				373	* or NULL if any problem
				374	*/
				375
				376	xmlChar *
				377	xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
				378	int i;
				379	xmlChar ch;
				380
				381	if (utf == NULL) return(NULL);
				382	if (start < 0) return(NULL);
				383	if (len < 0) return(NULL);
				384
				385	/*
				386	* Skip over any leading chars
				387	*/
				388	for (i = 0;i < start;i++) {
				389	if ((ch=*utf++) == 0) return(NULL);
				390	if ( ch & 0x80 ) {
				391	/* if not simple ascii, verify proper format */
				392	if ( (ch & 0xc0) != 0xc0 )
				393	return(NULL);
				394	/* then skip over remaining bytes for this char */
				395	while ( (ch <<= 1) & 0x80 )
				396	if ( (*utf++ & 0xc0) != 0x80 )
				397	return(NULL);
				398	}
				399	}
				400
				401	return(xmlUTF8Strndup(utf, len));
				402	}
				403
				404	/************************************************************************
				405	* *
				406	* Conversions To/From UTF8 encoding *
				407	* *
				408	************************************************************************/
				409
				410	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	411	* asciiToUTF8:
				412	* @out: a pointer to an array of bytes to store the result
				413	* @outlen: the length of @out
				414	* @in: a pointer to an array of ASCII chars
				415	* @inlen: the length of @in
				416	*
				417	* Take a block of ASCII chars in and try to convert it to an UTF-8
				418	* block of chars out.
				419	* Returns 0 if success, or -1 otherwise
				420	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	421	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	422	* The value of @outlen after return is the number of ocetes consumed.
				423	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	424	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	425	asciiToUTF8(unsigned char* out, int *outlen,
				426	const unsigned char* in, int *inlen) {
				427	unsigned char* outstart = out;
				428	const unsigned char* base = in;
				429	const unsigned char* processed = in;
				430	unsigned char* outend = out + *outlen;
				431	const unsigned char* inend;
				432	unsigned int c;
				433	int bits;
				434
				435	inend = in + (*inlen);
				436	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				437	c= *in++;
				438
				439	/* assertion: c is a single UTF-4 value */
				440	if (out >= outend)
				441	break;
				442	if (c < 0x80) { *out++= c; bits= -6; }
				443	else {
				444	*outlen = out - outstart;
				445	*inlen = processed - base;
				446	return(-1);
				447	}
				448
				449	for ( ; bits >= 0; bits-= 6) {
				450	if (out >= outend)
				451	break;
				452	*out++= ((c >> bits) & 0x3F) \| 0x80;
				453	}
				454	processed = (const unsigned char*) in;
				455	}
				456	*outlen = out - outstart;
				457	*inlen = processed - base;
				458	return(0);
				459	}
				460
				461	/**
				462	* UTF8Toascii:
				463	* @out: a pointer to an array of bytes to store the result
				464	* @outlen: the length of @out
				465	* @in: a pointer to an array of UTF-8 chars
				466	* @inlen: the length of @in
				467	*
				468	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				469	* block of chars out.
				470	*
				471	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				472	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	473	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	474	* The value of @outlen after return is the number of ocetes consumed.
				475	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	476	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	477	UTF8Toascii(unsigned char* out, int *outlen,
				478	const unsigned char* in, int *inlen) {
				479	const unsigned char* processed = in;
				480	const unsigned char* outend;
				481	const unsigned char* outstart = out;
				482	const unsigned char* instart = in;
				483	const unsigned char* inend;
				484	unsigned int c, d;
				485	int trailing;
				486
				487	if (in == NULL) {
				488	/*
				489	* initialization nothing to do
				490	*/
				491	*outlen = 0;
				492	*inlen = 0;
				493	return(0);
				494	}
				495	inend = in + (*inlen);
				496	outend = out + (*outlen);
				497	while (in < inend) {
				498	d = *in++;
				499	if (d < 0x80) { c= d; trailing= 0; }
				500	else if (d < 0xC0) {
				501	/* trailing byte in leading position */
				502	*outlen = out - outstart;
				503	*inlen = processed - instart;
				504	return(-2);
				505	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				506	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				507	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				508	else {
				509	/* no chance for this in Ascii */
				510	*outlen = out - outstart;
				511	*inlen = processed - instart;
				512	return(-2);
				513	}
				514
				515	if (inend - in < trailing) {
				516	break;
				517	}
				518
				519	for ( ; trailing; trailing--) {
				520	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				521	break;
				522	c <<= 6;
				523	c \|= d & 0x3F;
				524	}
				525
				526	/* assertion: c is a single UTF-4 value */
				527	if (c < 0x80) {
				528	if (out >= outend)
				529	break;
				530	*out++ = c;
				531	} else {
				532	/* no chance for this in Ascii */
				533	*outlen = out - outstart;
				534	*inlen = processed - instart;
				535	return(-2);
				536	}
				537	processed = in;
				538	}
				539	*outlen = out - outstart;
				540	*inlen = processed - instart;
				541	return(0);
				542	}
				543
				544	/**
				545	* isolat1ToUTF8:
				546	* @out: a pointer to an array of bytes to store the result
				547	* @outlen: the length of @out
				548	* @in: a pointer to an array of ISO Latin 1 chars
				549	* @inlen: the length of @in
				550	*
				551	* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
				552	* block of chars out.
				553	* Returns 0 if success, or -1 otherwise
				554	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	555	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	556	* The value of @outlen after return is the number of ocetes consumed.
				557	*/
				558	int
				559	isolat1ToUTF8(unsigned char* out, int *outlen,
				560	const unsigned char* in, int *inlen) {
				561	unsigned char* outstart = out;
				562	const unsigned char* base = in;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	563	unsigned char* outend = out + *outlen;
				564	const unsigned char* inend;
Daniel Veillard	e72c756	2002-05-31 09:47:30 +0000	[diff] [blame]	565	const unsigned char* instop;
				566	xmlChar c = *in;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	567
				568	inend = in + (*inlen);
Daniel Veillard	e72c756	2002-05-31 09:47:30 +0000	[diff] [blame]	569	instop = inend;
				570
				571	while (in < inend && out < outend - 1) {
				572	if (c >= 0x80) {
Daniel Veillard	db55291	2002-03-21 13:27:59 +0000	[diff] [blame]	573	*out++= ((c >> 6) & 0x1F) \| 0xC0;
Daniel Veillard	02141ea	2001-04-30 11:46:40 +0000	[diff] [blame]	574	*out++= (c & 0x3F) \| 0x80;
Daniel Veillard	e72c756	2002-05-31 09:47:30 +0000	[diff] [blame]	575	++in;
				576	c = *in;
				577	}
				578	if (instop - in > outend - out) instop = in + (outend - out);
				579	while (c < 0x80 && in < instop) {
				580	*out++ = c;
				581	++in;
				582	c = *in;
				583	}
				584	}
				585	if (in < inend && out < outend && c < 0x80) {
				586	*out++ = c;
				587	++in;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	588	}
				589	*outlen = out - outstart;
Daniel Veillard	e72c756	2002-05-31 09:47:30 +0000	[diff] [blame]	590	*inlen = in - base;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	591	return(0);
				592	}
				593
/**
 * UTF8ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-8 chars
 * @inlenb:  the length of @inb in bytes
 *
 * No op copy operation for UTF8 handling: copies as many bytes as fit,
 * i.e. the smaller of *@outlen and *@inlenb. The cut is made on a byte
 * count only, so it may fall inside a multi-byte character.
 *
 * Returns 0 on success, or -1 if an argument is NULL or the byte count
 * is negative.
 * The values of *@outlen and *@inlenb after a successful return are
 * both the number of bytes copied.
 */
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
           const unsigned char* inb, int *inlenb)
{
    int len;

    if ((out == NULL) || (inb == NULL) || (outlen == NULL) || (inlenb == NULL))
        return(-1);
    /* compare the lengths themselves, not the addresses holding them */
    if (*outlen > *inlenb) {
        len = *inlenb;
    } else {
        len = *outlen;
    }
    if (len < 0)
        return(-1);

    memcpy(out, inb, len);

    *outlen = len;
    *inlenb = len;
    return(0);
}
				630
Daniel Veillard	e72c756	2002-05-31 09:47:30 +0000	[diff] [blame]	631
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars, or NULL for initialization
 * @inlen:  the length of @in
 *
 * Takes a block of UTF-8 chars in and tries to convert it to an
 * ISO Latin 1 block of chars out. Conversion stops quietly (return 0)
 * when @out is full or when the input ends inside a multi-byte
 * sequence; that last, incomplete character is not counted as consumed.
 *
 * Returns 0 if success, or -2 if the transcoding fails (malformed UTF-8
 * or a character above 0xFF).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* src = in;        /* read cursor */
    const unsigned char* committed = in;  /* end of last converted char */
    const unsigned char* srcend;
    unsigned char* dst = out;             /* write cursor */
    const unsigned char* dstend;
    unsigned int value, octet;
    int extra;                            /* continuation bytes expected */
    int status = 0;

    if (in == NULL) {
        /* initialization call: nothing to convert */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    srcend = in + (*inlen);
    dstend = out + (*outlen);

    while (src < srcend) {
        octet = *src++;
        if (octet < 0x80) {
            value = octet;
            extra = 0;
        } else if (octet < 0xC0) {
            /* continuation byte where a lead byte is expected */
            status = -2;
            goto finish;
        } else if (octet < 0xE0) {
            value = octet & 0x1F;
            extra = 1;
        } else if (octet < 0xF0) {
            value = octet & 0x0F;
            extra = 2;
        } else if (octet < 0xF8) {
            value = octet & 0x07;
            extra = 3;
        } else {
            /* no chance for this in IsoLat1 */
            status = -2;
            goto finish;
        }

        /* the rest of the sequence is not in this block yet */
        if (srcend - src < extra)
            goto finish;

        while (extra > 0) {
            octet = *src++;
            if ((octet & 0xC0) != 0x80) {
                status = -2;
                goto finish;
            }
            value = (value << 6) | (octet & 0x3F);
            extra--;
        }

        /* value now holds one complete UCS-4 code point */
        if (value > 0xFF) {
            /* no chance for this in IsoLat1 */
            status = -2;
            goto finish;
        }
        if (dst >= dstend)
            goto finish;
        *dst++ = value;
        committed = src;
    }

finish:
    *outlen = dst - out;
    *inlen = committed - in;
    return(status);
}
				719
				720	/**
				721	* UTF16LEToUTF8:
				722	* @out: a pointer to an array of bytes to store the result
				723	* @outlen: the length of @out
				724	* @inb: a pointer to an array of UTF-16LE passwd as a byte array
				725	* @inlenb: the length of @in in UTF-16LE chars
				726	*
				727	* Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	728	* block of chars out. This function assume the endian property
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	729	* is the same between the native type of this machine and the
				730	* inputed one.
				731	*
				732	* Returns the number of byte written, or -1 by lack of space, or -2
				733	* if the transcoding fails (for *in is not valid utf16 string)
				734	* The value of *inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	735	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	736	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	737	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	738	UTF16LEToUTF8(unsigned char* out, int *outlen,
				739	const unsigned char* inb, int *inlenb)
				740	{
				741	unsigned char* outstart = out;
				742	const unsigned char* processed = inb;
				743	unsigned char* outend = out + *outlen;
				744	unsigned short* in = (unsigned short*) inb;
				745	unsigned short* inend;
				746	unsigned int c, d, inlen;
				747	unsigned char *tmp;
				748	int bits;
				749
				750	if ((*inlenb % 2) == 1)
				751	(*inlenb)--;
				752	inlen = *inlenb / 2;
				753	inend = in + inlen;
				754	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				755	if (xmlLittleEndian) {
				756	c= *in++;
				757	} else {
				758	tmp = (unsigned char *) in;
				759	c = *tmp++;
				760	c = c \| (((unsigned int)*tmp) << 8);
				761	in++;
				762	}
				763	if ((c & 0xFC00) == 0xD800) { /* surrogates */
				764	if (in >= inend) { /* (in > inend) shouldn't happens */
				765	break;
				766	}
				767	if (xmlLittleEndian) {
				768	d = *in++;
				769	} else {
				770	tmp = (unsigned char *) in;
				771	d = *tmp++;
				772	d = d \| (((unsigned int)*tmp) << 8);
				773	in++;
				774	}
				775	if ((d & 0xFC00) == 0xDC00) {
				776	c &= 0x03FF;
				777	c <<= 10;
				778	c \|= d & 0x03FF;
				779	c += 0x10000;
				780	}
				781	else {
				782	*outlen = out - outstart;
				783	*inlenb = processed - inb;
				784	return(-2);
				785	}
				786	}
				787
				788	/* assertion: c is a single UTF-4 value */
				789	if (out >= outend)
				790	break;
				791	if (c < 0x80) { *out++= c; bits= -6; }
				792	else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				793	else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				794	else { *out++= ((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				795
				796	for ( ; bits >= 0; bits-= 6) {
				797	if (out >= outend)
				798	break;
				799	*out++= ((c >> bits) & 0x3F) \| 0x80;
				800	}
				801	processed = (const unsigned char*) in;
				802	}
				803	*outlen = out - outstart;
				804	*inlenb = processed - inb;
				805	return(0);
				806	}
				807
				808	/**
				809	* UTF8ToUTF16LE:
				810	* @outb: a pointer to an array of bytes to store the result
				811	* @outlen: the length of @outb
				812	* @in: a pointer to an array of UTF-8 chars
				813	* @inlen: the length of @in
				814	*
				815	* Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
				816	* block of chars out.
				817	*
				818	* Returns the number of byte written, or -1 by lack of space, or -2
				819	* if the transcoding failed.
				820	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	821	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	822	UTF8ToUTF16LE(unsigned char* outb, int *outlen,
				823	const unsigned char* in, int *inlen)
				824	{
				825	unsigned short* out = (unsigned short*) outb;
				826	const unsigned char* processed = in;
				827	unsigned short* outstart= out;
				828	unsigned short* outend;
				829	const unsigned char* inend= in+*inlen;
				830	unsigned int c, d;
				831	int trailing;
				832	unsigned char *tmp;
				833	unsigned short tmp1, tmp2;
				834
				835	if (in == NULL) {
				836	/*
				837	* initialization, add the Byte Order Mark
				838	*/
				839	if (*outlen >= 2) {
				840	outb[0] = 0xFF;
				841	outb[1] = 0xFE;
				842	*outlen = 2;
				843	*inlen = 0;
				844	#ifdef DEBUG_ENCODING
				845	xmlGenericError(xmlGenericErrorContext,
				846	"Added FFFE Byte Order Mark\n");
				847	#endif
				848	return(2);
				849	}
				850	*outlen = 0;
				851	*inlen = 0;
				852	return(0);
				853	}
				854	outend = out + (*outlen / 2);
				855	while (in < inend) {
				856	d= *in++;
				857	if (d < 0x80) { c= d; trailing= 0; }
				858	else if (d < 0xC0) {
				859	/* trailing byte in leading position */
				860	outlen = (out - outstart) 2;
				861	*inlen = processed - in;
				862	return(-2);
				863	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				864	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				865	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				866	else {
				867	/* no chance for this in UTF-16 */
				868	outlen = (out - outstart) 2;
				869	*inlen = processed - in;
				870	return(-2);
				871	}
				872
				873	if (inend - in < trailing) {
				874	break;
				875	}
				876
				877	for ( ; trailing; trailing--) {
				878	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				879	break;
				880	c <<= 6;
				881	c \|= d & 0x3F;
				882	}
				883
				884	/* assertion: c is a single UTF-4 value */
				885	if (c < 0x10000) {
				886	if (out >= outend)
				887	break;
				888	if (xmlLittleEndian) {
				889	*out++ = c;
				890	} else {
				891	tmp = (unsigned char *) out;
				892	*tmp = c ;
				893	*(tmp + 1) = c >> 8 ;
				894	out++;
				895	}
				896	}
				897	else if (c < 0x110000) {
				898	if (out+1 >= outend)
				899	break;
				900	c -= 0x10000;
				901	if (xmlLittleEndian) {
				902	*out++ = 0xD800 \| (c >> 10);
				903	*out++ = 0xDC00 \| (c & 0x03FF);
				904	} else {
				905	tmp1 = 0xD800 \| (c >> 10);
				906	tmp = (unsigned char *) out;
				907	*tmp = (unsigned char) tmp1;
				908	*(tmp + 1) = tmp1 >> 8;
				909	out++;
				910
				911	tmp2 = 0xDC00 \| (c & 0x03FF);
				912	tmp = (unsigned char *) out;
				913	*tmp = (unsigned char) tmp2;
				914	*(tmp + 1) = tmp2 >> 8;
				915	out++;
				916	}
				917	}
				918	else
				919	break;
				920	processed = in;
				921	}
				922	outlen = (out - outstart) 2;
				923	*inlen = processed - in;
				924	return(0);
				925	}
				926
				927	/**
				928	* UTF16BEToUTF8:
				929	* @out: a pointer to an array of bytes to store the result
				930	* @outlen: the length of @out
				931	* @inb: a pointer to an array of UTF-16 passwd as a byte array
				932	* @inlenb: the length of @in in UTF-16 chars
				933	*
				934	* Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	935	* block of chars out. This function assume the endian property
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	936	* is the same between the native type of this machine and the
				937	* inputed one.
				938	*
				939	* Returns the number of byte written, or -1 by lack of space, or -2
				940	* if the transcoding fails (for *in is not valid utf16 string)
				941	* The value of *inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	942	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	943	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	944	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	945	UTF16BEToUTF8(unsigned char* out, int *outlen,
				946	const unsigned char* inb, int *inlenb)
				947	{
				948	unsigned char* outstart = out;
				949	const unsigned char* processed = inb;
				950	unsigned char* outend = out + *outlen;
				951	unsigned short* in = (unsigned short*) inb;
				952	unsigned short* inend;
				953	unsigned int c, d, inlen;
				954	unsigned char *tmp;
				955	int bits;
				956
				957	if ((*inlenb % 2) == 1)
				958	(*inlenb)--;
				959	inlen = *inlenb / 2;
				960	inend= in + inlen;
				961	while (in < inend) {
				962	if (xmlLittleEndian) {
				963	tmp = (unsigned char *) in;
				964	c = *tmp++;
				965	c = c << 8;
				966	c = c \| (unsigned int) *tmp;
				967	in++;
				968	} else {
				969	c= *in++;
				970	}
				971	if ((c & 0xFC00) == 0xD800) { /* surrogates */
				972	if (in >= inend) { /* (in > inend) shouldn't happens */
				973	*outlen = out - outstart;
				974	*inlenb = processed - inb;
				975	return(-2);
				976	}
				977	if (xmlLittleEndian) {
				978	tmp = (unsigned char *) in;
				979	d = *tmp++;
				980	d = d << 8;
				981	d = d \| (unsigned int) *tmp;
				982	in++;
				983	} else {
				984	d= *in++;
				985	}
				986	if ((d & 0xFC00) == 0xDC00) {
				987	c &= 0x03FF;
				988	c <<= 10;
				989	c \|= d & 0x03FF;
				990	c += 0x10000;
				991	}
				992	else {
				993	*outlen = out - outstart;
				994	*inlenb = processed - inb;
				995	return(-2);
				996	}
				997	}
				998
				999	/* assertion: c is a single UTF-4 value */
				1000	if (out >= outend)
				1001	break;
				1002	if (c < 0x80) { *out++= c; bits= -6; }
				1003	else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1004	else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1005	else { *out++= ((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1006
				1007	for ( ; bits >= 0; bits-= 6) {
				1008	if (out >= outend)
				1009	break;
				1010	*out++= ((c >> bits) & 0x3F) \| 0x80;
				1011	}
				1012	processed = (const unsigned char*) in;
				1013	}
				1014	*outlen = out - outstart;
				1015	*inlenb = processed - inb;
				1016	return(0);
				1017	}
				1018
				1019	/**
				1020	* UTF8ToUTF16BE:
				1021	* @outb: a pointer to an array of bytes to store the result
				1022	* @outlen: the length of @outb
				1023	* @in: a pointer to an array of UTF-8 chars
				1024	* @inlen: the length of @in
				1025	*
				1026	* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
				1027	* block of chars out.
				1028	*
				1029	* Returns the number of byte written, or -1 by lack of space, or -2
				1030	* if the transcoding failed.
				1031	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1032	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1033	UTF8ToUTF16BE(unsigned char* outb, int *outlen,
				1034	const unsigned char* in, int *inlen)
				1035	{
				1036	unsigned short* out = (unsigned short*) outb;
				1037	const unsigned char* processed = in;
				1038	unsigned short* outstart= out;
				1039	unsigned short* outend;
				1040	const unsigned char* inend= in+*inlen;
				1041	unsigned int c, d;
				1042	int trailing;
				1043	unsigned char *tmp;
				1044	unsigned short tmp1, tmp2;
				1045
				1046	if (in == NULL) {
				1047	/*
				1048	* initialization, add the Byte Order Mark
				1049	*/
				1050	if (*outlen >= 2) {
				1051	outb[0] = 0xFE;
				1052	outb[1] = 0xFF;
				1053	*outlen = 2;
				1054	*inlen = 0;
				1055	#ifdef DEBUG_ENCODING
				1056	xmlGenericError(xmlGenericErrorContext,
				1057	"Added FEFF Byte Order Mark\n");
				1058	#endif
				1059	return(2);
				1060	}
				1061	*outlen = 0;
				1062	*inlen = 0;
				1063	return(0);
				1064	}
				1065	outend = out + (*outlen / 2);
				1066	while (in < inend) {
				1067	d= *in++;
				1068	if (d < 0x80) { c= d; trailing= 0; }
				1069	else if (d < 0xC0) {
				1070	/* trailing byte in leading position */
				1071	*outlen = out - outstart;
				1072	*inlen = processed - in;
				1073	return(-2);
				1074	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1075	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1076	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1077	else {
				1078	/* no chance for this in UTF-16 */
				1079	*outlen = out - outstart;
				1080	*inlen = processed - in;
				1081	return(-2);
				1082	}
				1083
				1084	if (inend - in < trailing) {
				1085	break;
				1086	}
				1087
				1088	for ( ; trailing; trailing--) {
				1089	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80)) break;
				1090	c <<= 6;
				1091	c \|= d & 0x3F;
				1092	}
				1093
				1094	/* assertion: c is a single UTF-4 value */
				1095	if (c < 0x10000) {
				1096	if (out >= outend) break;
				1097	if (xmlLittleEndian) {
				1098	tmp = (unsigned char *) out;
				1099	*tmp = c >> 8;
				1100	*(tmp + 1) = c;
				1101	out++;
				1102	} else {
				1103	*out++ = c;
				1104	}
				1105	}
				1106	else if (c < 0x110000) {
				1107	if (out+1 >= outend) break;
				1108	c -= 0x10000;
				1109	if (xmlLittleEndian) {
				1110	tmp1 = 0xD800 \| (c >> 10);
				1111	tmp = (unsigned char *) out;
				1112	*tmp = tmp1 >> 8;
				1113	*(tmp + 1) = (unsigned char) tmp1;
				1114	out++;
				1115
				1116	tmp2 = 0xDC00 \| (c & 0x03FF);
				1117	tmp = (unsigned char *) out;
				1118	*tmp = tmp2 >> 8;
				1119	*(tmp + 1) = (unsigned char) tmp2;
				1120	out++;
				1121	} else {
				1122	*out++ = 0xD800 \| (c >> 10);
				1123	*out++ = 0xDC00 \| (c & 0x03FF);
				1124	}
				1125	}
				1126	else
				1127	break;
				1128	processed = in;
				1129	}
				1130	outlen = (out - outstart) 2;
				1131	*inlen = processed - in;
				1132	return(0);
				1133	}
				1134
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1135	/************************************************************************
				1136	* *
				1137	* Generic encoding handling routines *
				1138	* *
				1139	************************************************************************/
				1140
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1141	/**
				1142	* xmlDetectCharEncoding:
				1143	* @in: a pointer to the first bytes of the XML entity, must be at least
				1144	* 4 bytes long.
				1145	* @len: pointer to the length of the buffer
				1146	*
				1147	* Guess the encoding of the entity using the first bytes of the entity content
				1148	* accordingly of the non-normative appendix F of the XML-1.0 recommendation.
				1149	*
				1150	* Returns one of the XML_CHAR_ENCODING_... values.
				1151	*/
				1152	xmlCharEncoding
				1153	xmlDetectCharEncoding(const unsigned char* in, int len)
				1154	{
				1155	if (len >= 4) {
				1156	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				1157	(in[2] == 0x00) && (in[3] == 0x3C))
				1158	return(XML_CHAR_ENCODING_UCS4BE);
				1159	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
				1160	(in[2] == 0x00) && (in[3] == 0x00))
				1161	return(XML_CHAR_ENCODING_UCS4LE);
				1162	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				1163	(in[2] == 0x3C) && (in[3] == 0x00))
				1164	return(XML_CHAR_ENCODING_UCS4_2143);
				1165	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
				1166	(in[2] == 0x00) && (in[3] == 0x00))
				1167	return(XML_CHAR_ENCODING_UCS4_3412);
				1168	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
				1169	(in[2] == 0xA7) && (in[3] == 0x94))
				1170	return(XML_CHAR_ENCODING_EBCDIC);
				1171	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
				1172	(in[2] == 0x78) && (in[3] == 0x6D))
				1173	return(XML_CHAR_ENCODING_UTF8);
				1174	}
Daniel Veillard	87a764e	2001-06-20 17:41:10 +0000	[diff] [blame]	1175	if (len >= 3) {
				1176	/*
				1177	* Errata on XML-1.0 June 20 2001
				1178	* We now allow an UTF8 encoded BOM
				1179	*/
				1180	if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
				1181	(in[2] == 0xBF))
				1182	return(XML_CHAR_ENCODING_UTF8);
				1183	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1184	if (len >= 2) {
				1185	if ((in[0] == 0xFE) && (in[1] == 0xFF))
				1186	return(XML_CHAR_ENCODING_UTF16BE);
				1187	if ((in[0] == 0xFF) && (in[1] == 0xFE))
				1188	return(XML_CHAR_ENCODING_UTF16LE);
				1189	}
				1190	return(XML_CHAR_ENCODING_NONE);
				1191	}
				1192
				1193	/**
				1194	* xmlCleanupEncodingAliases:
				1195	*
				1196	* Unregisters all aliases
				1197	*/
				1198	void
				1199	xmlCleanupEncodingAliases(void) {
				1200	int i;
				1201
				1202	if (xmlCharEncodingAliases == NULL)
				1203	return;
				1204
				1205	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1206	if (xmlCharEncodingAliases[i].name != NULL)
				1207	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1208	if (xmlCharEncodingAliases[i].alias != NULL)
				1209	xmlFree((char *) xmlCharEncodingAliases[i].alias);
				1210	}
				1211	xmlCharEncodingAliasesNb = 0;
				1212	xmlCharEncodingAliasesMax = 0;
				1213	xmlFree(xmlCharEncodingAliases);
Daniel Veillard	73c6e53	2002-01-08 13:15:33 +0000	[diff] [blame]	1214	xmlCharEncodingAliases = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1215	}
				1216
				1217	/**
				1218	* xmlGetEncodingAlias:
				1219	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1220	*
				1221	* Lookup an encoding name for the given alias.
				1222	*
				1223	* Returns NULL if not found the original name otherwise
				1224	*/
				1225	const char *
				1226	xmlGetEncodingAlias(const char *alias) {
				1227	int i;
				1228	char upper[100];
				1229
				1230	if (alias == NULL)
				1231	return(NULL);
				1232
				1233	if (xmlCharEncodingAliases == NULL)
				1234	return(NULL);
				1235
				1236	for (i = 0;i < 99;i++) {
				1237	upper[i] = toupper(alias[i]);
				1238	if (upper[i] == 0) break;
				1239	}
				1240	upper[i] = 0;
				1241
				1242	/*
				1243	* Walk down the list looking for a definition of the alias
				1244	*/
				1245	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1246	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
				1247	return(xmlCharEncodingAliases[i].name);
				1248	}
				1249	}
				1250	return(NULL);
				1251	}
				1252
				1253	/**
				1254	* xmlAddEncodingAlias:
				1255	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
				1256	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1257	*
				1258	* Registers and alias @alias for an encoding named @name. Existing alias
				1259	* will be overwritten.
				1260	*
				1261	* Returns 0 in case of success, -1 in case of error
				1262	*/
				1263	int
				1264	xmlAddEncodingAlias(const char name, const char alias) {
				1265	int i;
				1266	char upper[100];
				1267
				1268	if ((name == NULL) \|\| (alias == NULL))
				1269	return(-1);
				1270
				1271	for (i = 0;i < 99;i++) {
				1272	upper[i] = toupper(alias[i]);
				1273	if (upper[i] == 0) break;
				1274	}
				1275	upper[i] = 0;
				1276
				1277	if (xmlCharEncodingAliases == NULL) {
				1278	xmlCharEncodingAliasesNb = 0;
				1279	xmlCharEncodingAliasesMax = 20;
				1280	xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
				1281	xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
				1282	if (xmlCharEncodingAliases == NULL)
				1283	return(-1);
				1284	} else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
				1285	xmlCharEncodingAliasesMax *= 2;
				1286	xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
				1287	xmlRealloc(xmlCharEncodingAliases,
				1288	xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
				1289	}
				1290	/*
				1291	* Walk down the list looking for a definition of the alias
				1292	*/
				1293	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1294	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
				1295	/*
				1296	* Replace the definition.
				1297	*/
				1298	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1299	xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
				1300	return(0);
				1301	}
				1302	}
				1303	/*
				1304	* Add the definition
				1305	*/
				1306	xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
				1307	xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
				1308	xmlCharEncodingAliasesNb++;
				1309	return(0);
				1310	}
				1311
				1312	/**
				1313	* xmlDelEncodingAlias:
				1314	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1315	*
				1316	* Unregisters an encoding alias @alias
				1317	*
				1318	* Returns 0 in case of success, -1 in case of error
				1319	*/
				1320	int
				1321	xmlDelEncodingAlias(const char *alias) {
				1322	int i;
				1323
				1324	if (alias == NULL)
				1325	return(-1);
				1326
				1327	if (xmlCharEncodingAliases == NULL)
				1328	return(-1);
				1329	/*
				1330	* Walk down the list looking for a definition of the alias
				1331	*/
				1332	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1333	if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
				1334	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1335	xmlFree((char *) xmlCharEncodingAliases[i].alias);
				1336	xmlCharEncodingAliasesNb--;
				1337	memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
				1338	sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
				1339	return(0);
				1340	}
				1341	}
				1342	return(-1);
				1343	}
				1344
				1345	/**
				1346	* xmlParseCharEncoding:
				1347	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
				1348	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1349	* Compare the string to the known encoding schemes already known. Note
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1350	* that the comparison is case insensitive accordingly to the section
				1351	* [XML] 4.3.3 Character Encoding in Entities.
				1352	*
				1353	* Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
				1354	* if not recognized.
				1355	*/
				1356	xmlCharEncoding
				1357	xmlParseCharEncoding(const char* name)
				1358	{
				1359	const char *alias;
				1360	char upper[500];
				1361	int i;
				1362
				1363	if (name == NULL)
				1364	return(XML_CHAR_ENCODING_NONE);
				1365
				1366	/*
				1367	* Do the alias resolution
				1368	*/
				1369	alias = xmlGetEncodingAlias(name);
				1370	if (alias != NULL)
				1371	name = alias;
				1372
				1373	for (i = 0;i < 499;i++) {
				1374	upper[i] = toupper(name[i]);
				1375	if (upper[i] == 0) break;
				1376	}
				1377	upper[i] = 0;
				1378
				1379	if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
				1380	if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
				1381	if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
				1382
				1383	/*
				1384	* NOTE: if we were able to parse this, the endianness of UTF16 is
				1385	* already found and in use
				1386	*/
				1387	if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
				1388	if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
				1389
				1390	if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				1391	if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				1392	if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
				1393
				1394	/*
				1395	* NOTE: if we were able to parse this, the endianness of UCS4 is
				1396	* already found and in use
				1397	*/
				1398	if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				1399	if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				1400	if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
				1401
				1402
				1403	if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
				1404	if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
				1405	if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
				1406
				1407	if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
				1408	if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
				1409	if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
				1410
				1411	if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
				1412	if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
				1413	if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
				1414	if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
				1415	if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
				1416	if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
				1417	if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
				1418
				1419	if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
				1420	if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
				1421	if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
				1422
				1423	#ifdef DEBUG_ENCODING
				1424	xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
				1425	#endif
				1426	return(XML_CHAR_ENCODING_ERROR);
				1427	}
				1428
				1429	/**
				1430	* xmlGetCharEncodingName:
				1431	* @enc: the encoding
				1432	*
				1433	* The "canonical" name for XML encoding.
				1434	* C.f. http://www.w3.org/TR/REC-xml#charencoding
				1435	* Section 4.3.3 Character Encoding in Entities
				1436	*
				1437	* Returns the canonical name for the given encoding
				1438	*/
				1439
				1440	const char*
				1441	xmlGetCharEncodingName(xmlCharEncoding enc) {
				1442	switch (enc) {
				1443	case XML_CHAR_ENCODING_ERROR:
				1444	return(NULL);
				1445	case XML_CHAR_ENCODING_NONE:
				1446	return(NULL);
				1447	case XML_CHAR_ENCODING_UTF8:
				1448	return("UTF-8");
				1449	case XML_CHAR_ENCODING_UTF16LE:
				1450	return("UTF-16");
				1451	case XML_CHAR_ENCODING_UTF16BE:
				1452	return("UTF-16");
				1453	case XML_CHAR_ENCODING_EBCDIC:
				1454	return("EBCDIC");
				1455	case XML_CHAR_ENCODING_UCS4LE:
				1456	return("ISO-10646-UCS-4");
				1457	case XML_CHAR_ENCODING_UCS4BE:
				1458	return("ISO-10646-UCS-4");
				1459	case XML_CHAR_ENCODING_UCS4_2143:
				1460	return("ISO-10646-UCS-4");
				1461	case XML_CHAR_ENCODING_UCS4_3412:
				1462	return("ISO-10646-UCS-4");
				1463	case XML_CHAR_ENCODING_UCS2:
				1464	return("ISO-10646-UCS-2");
				1465	case XML_CHAR_ENCODING_8859_1:
				1466	return("ISO-8859-1");
				1467	case XML_CHAR_ENCODING_8859_2:
				1468	return("ISO-8859-2");
				1469	case XML_CHAR_ENCODING_8859_3:
				1470	return("ISO-8859-3");
				1471	case XML_CHAR_ENCODING_8859_4:
				1472	return("ISO-8859-4");
				1473	case XML_CHAR_ENCODING_8859_5:
				1474	return("ISO-8859-5");
				1475	case XML_CHAR_ENCODING_8859_6:
				1476	return("ISO-8859-6");
				1477	case XML_CHAR_ENCODING_8859_7:
				1478	return("ISO-8859-7");
				1479	case XML_CHAR_ENCODING_8859_8:
				1480	return("ISO-8859-8");
				1481	case XML_CHAR_ENCODING_8859_9:
				1482	return("ISO-8859-9");
				1483	case XML_CHAR_ENCODING_2022_JP:
				1484	return("ISO-2022-JP");
				1485	case XML_CHAR_ENCODING_SHIFT_JIS:
				1486	return("Shift-JIS");
				1487	case XML_CHAR_ENCODING_EUC_JP:
				1488	return("EUC-JP");
				1489	case XML_CHAR_ENCODING_ASCII:
				1490	return(NULL);
				1491	}
				1492	return(NULL);
				1493	}
				1494
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1495	/************************************************************************
				1496	* *
				1497	* Char encoding handlers *
				1498	* *
				1499	************************************************************************/
				1500
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1501
				1502	/* the size should be growable, but it's not a big deal ... */
				1503	#define MAX_ENCODING_HANDLERS 50
				1504	static xmlCharEncodingHandlerPtr *handlers = NULL;
				1505	static int nbCharEncodingHandler = 0;
				1506
				1507	/*
				1508	* The default is UTF-8 for XML, that's also the default used for the
				1509	* parser internals, so the default encoding handler is NULL
				1510	*/
				1511
				1512	static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
				1513
				1514	/**
				1515	* xmlNewCharEncodingHandler:
				1516	* @name: the encoding name, in UTF-8 format (ASCII actually)
				1517	* @input: the xmlCharEncodingInputFunc to read that encoding
				1518	* @output: the xmlCharEncodingOutputFunc to write that encoding
				1519	*
				1520	* Create and registers an xmlCharEncodingHandler.
Daniel Veillard	6f46f6c	2002-08-01 12:22:24 +0000	[diff] [blame]	1521	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1522	* Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
				1523	*/
Daniel Veillard	6f46f6c	2002-08-01 12:22:24 +0000	[diff] [blame]	1524	xmlCharEncodingHandlerPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1525	xmlNewCharEncodingHandler(const char *name,
				1526	xmlCharEncodingInputFunc input,
				1527	xmlCharEncodingOutputFunc output) {
				1528	xmlCharEncodingHandlerPtr handler;
				1529	const char *alias;
				1530	char upper[500];
				1531	int i;
				1532	char *up = 0;
				1533
				1534	/*
				1535	* Do the alias resolution
				1536	*/
				1537	alias = xmlGetEncodingAlias(name);
				1538	if (alias != NULL)
				1539	name = alias;
				1540
				1541	/*
				1542	* Keep only the uppercase version of the encoding.
				1543	*/
				1544	if (name == NULL) {
				1545	xmlGenericError(xmlGenericErrorContext,
				1546	"xmlNewCharEncodingHandler : no name !\n");
				1547	return(NULL);
				1548	}
				1549	for (i = 0;i < 499;i++) {
				1550	upper[i] = toupper(name[i]);
				1551	if (upper[i] == 0) break;
				1552	}
				1553	upper[i] = 0;
				1554	up = xmlMemStrdup(upper);
				1555	if (up == NULL) {
				1556	xmlGenericError(xmlGenericErrorContext,
				1557	"xmlNewCharEncodingHandler : out of memory !\n");
				1558	return(NULL);
				1559	}
				1560
				1561	/*
				1562	* allocate and fill-up an handler block.
				1563	*/
				1564	handler = (xmlCharEncodingHandlerPtr)
				1565	xmlMalloc(sizeof(xmlCharEncodingHandler));
				1566	if (handler == NULL) {
				1567	xmlGenericError(xmlGenericErrorContext,
				1568	"xmlNewCharEncodingHandler : out of memory !\n");
				1569	return(NULL);
				1570	}
				1571	handler->input = input;
				1572	handler->output = output;
				1573	handler->name = up;
				1574
				1575	#ifdef LIBXML_ICONV_ENABLED
				1576	handler->iconv_in = NULL;
				1577	handler->iconv_out = NULL;
				1578	#endif /* LIBXML_ICONV_ENABLED */
				1579
				1580	/*
				1581	* registers and returns the handler.
				1582	*/
				1583	xmlRegisterCharEncodingHandler(handler);
				1584	#ifdef DEBUG_ENCODING
				1585	xmlGenericError(xmlGenericErrorContext,
				1586	"Registered encoding handler for %s\n", name);
				1587	#endif
				1588	return(handler);
				1589	}
				1590
				1591	/**
				1592	* xmlInitCharEncodingHandlers:
				1593	*
				1594	* Initialize the char encoding support, it registers the default
				1595	* encoding supported.
				1596	* NOTE: while public, this function usually doesn't need to be called
				1597	* in normal processing.
				1598	*/
				1599	void
				1600	xmlInitCharEncodingHandlers(void) {
				1601	unsigned short int tst = 0x1234;
				1602	unsigned char ptr = (unsigned char ) &tst;
				1603
				1604	if (handlers != NULL) return;
				1605
				1606	handlers = (xmlCharEncodingHandlerPtr *)
				1607	xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
				1608
				1609	if (*ptr == 0x12) xmlLittleEndian = 0;
				1610	else if (*ptr == 0x34) xmlLittleEndian = 1;
				1611	else xmlGenericError(xmlGenericErrorContext,
				1612	"Odd problem at endianness detection\n");
				1613
				1614	if (handlers == NULL) {
				1615	xmlGenericError(xmlGenericErrorContext,
				1616	"xmlInitCharEncodingHandlers : out of memory !\n");
				1617	return;
				1618	}
Daniel Veillard	81601f9	2003-01-14 13:42:37 +0000	[diff] [blame]	1619	xmlNewCharEncodingHandler("UTF-8", UTF8ToUTF8, UTF8ToUTF8);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1620	xmlUTF16LEHandler =
				1621	xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
				1622	xmlUTF16BEHandler =
				1623	xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
				1624	xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
				1625	xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard	2004242	2001-05-31 18:22:04 +0000	[diff] [blame]	1626	xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1627	#ifdef LIBXML_HTML_ENABLED
				1628	xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
				1629	#endif
				1630	}
				1631
				1632	/**
				1633	* xmlCleanupCharEncodingHandlers:
				1634	*
				1635	* Cleanup the memory allocated for the char encoding support, it
				1636	* unregisters all the encoding handlers and the aliases.
				1637	*/
				1638	void
				1639	xmlCleanupCharEncodingHandlers(void) {
				1640	xmlCleanupEncodingAliases();
				1641
				1642	if (handlers == NULL) return;
				1643
				1644	for (;nbCharEncodingHandler > 0;) {
				1645	nbCharEncodingHandler--;
				1646	if (handlers[nbCharEncodingHandler] != NULL) {
				1647	if (handlers[nbCharEncodingHandler]->name != NULL)
				1648	xmlFree(handlers[nbCharEncodingHandler]->name);
				1649	xmlFree(handlers[nbCharEncodingHandler]);
				1650	}
				1651	}
				1652	xmlFree(handlers);
				1653	handlers = NULL;
				1654	nbCharEncodingHandler = 0;
				1655	xmlDefaultCharEncodingHandler = NULL;
				1656	}
				1657
				1658	/**
				1659	* xmlRegisterCharEncodingHandler:
				1660	* @handler: the xmlCharEncodingHandlerPtr handler block
				1661	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1662	* Register the char encoding handler, surprising, isn't it ?
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1663	*/
				1664	void
				1665	xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
				1666	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1667	if (handler == NULL) {
				1668	xmlGenericError(xmlGenericErrorContext,
				1669	"xmlRegisterCharEncodingHandler: NULL handler !\n");
				1670	return;
				1671	}
				1672
				1673	if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
				1674	xmlGenericError(xmlGenericErrorContext,
				1675	"xmlRegisterCharEncodingHandler: Too many handler registered\n");
				1676	xmlGenericError(xmlGenericErrorContext,
				1677	"\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
				1678	return;
				1679	}
				1680	handlers[nbCharEncodingHandler++] = handler;
				1681	}
				1682
				1683	/**
				1684	* xmlGetCharEncodingHandler:
				1685	* @enc: an xmlCharEncoding value.
				1686	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1687	* Search in the registered set the handler able to read/write that encoding.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1688	*
				1689	* Returns the handler or NULL if not found
				1690	*/
				1691	xmlCharEncodingHandlerPtr
				1692	xmlGetCharEncodingHandler(xmlCharEncoding enc) {
				1693	xmlCharEncodingHandlerPtr handler;
				1694
				1695	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1696	switch (enc) {
				1697	case XML_CHAR_ENCODING_ERROR:
				1698	return(NULL);
				1699	case XML_CHAR_ENCODING_NONE:
				1700	return(NULL);
				1701	case XML_CHAR_ENCODING_UTF8:
				1702	return(NULL);
				1703	case XML_CHAR_ENCODING_UTF16LE:
				1704	return(xmlUTF16LEHandler);
				1705	case XML_CHAR_ENCODING_UTF16BE:
				1706	return(xmlUTF16BEHandler);
				1707	case XML_CHAR_ENCODING_EBCDIC:
				1708	handler = xmlFindCharEncodingHandler("EBCDIC");
				1709	if (handler != NULL) return(handler);
				1710	handler = xmlFindCharEncodingHandler("ebcdic");
				1711	if (handler != NULL) return(handler);
				1712	break;
				1713	case XML_CHAR_ENCODING_UCS4BE:
				1714	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1715	if (handler != NULL) return(handler);
				1716	handler = xmlFindCharEncodingHandler("UCS-4");
				1717	if (handler != NULL) return(handler);
				1718	handler = xmlFindCharEncodingHandler("UCS4");
				1719	if (handler != NULL) return(handler);
				1720	break;
				1721	case XML_CHAR_ENCODING_UCS4LE:
				1722	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1723	if (handler != NULL) return(handler);
				1724	handler = xmlFindCharEncodingHandler("UCS-4");
				1725	if (handler != NULL) return(handler);
				1726	handler = xmlFindCharEncodingHandler("UCS4");
				1727	if (handler != NULL) return(handler);
				1728	break;
				1729	case XML_CHAR_ENCODING_UCS4_2143:
				1730	break;
				1731	case XML_CHAR_ENCODING_UCS4_3412:
				1732	break;
				1733	case XML_CHAR_ENCODING_UCS2:
				1734	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
				1735	if (handler != NULL) return(handler);
				1736	handler = xmlFindCharEncodingHandler("UCS-2");
				1737	if (handler != NULL) return(handler);
				1738	handler = xmlFindCharEncodingHandler("UCS2");
				1739	if (handler != NULL) return(handler);
				1740	break;
				1741
				1742	/*
				1743	* We used to keep ISO Latin encodings native in the
				1744	* generated data. This led to so many problems that
				1745	* this has been removed. One can still change this
				1746	* back by registering no-ops encoders for those
				1747	*/
				1748	case XML_CHAR_ENCODING_8859_1:
				1749	handler = xmlFindCharEncodingHandler("ISO-8859-1");
				1750	if (handler != NULL) return(handler);
				1751	break;
				1752	case XML_CHAR_ENCODING_8859_2:
				1753	handler = xmlFindCharEncodingHandler("ISO-8859-2");
				1754	if (handler != NULL) return(handler);
				1755	break;
				1756	case XML_CHAR_ENCODING_8859_3:
				1757	handler = xmlFindCharEncodingHandler("ISO-8859-3");
				1758	if (handler != NULL) return(handler);
				1759	break;
				1760	case XML_CHAR_ENCODING_8859_4:
				1761	handler = xmlFindCharEncodingHandler("ISO-8859-4");
				1762	if (handler != NULL) return(handler);
				1763	break;
				1764	case XML_CHAR_ENCODING_8859_5:
				1765	handler = xmlFindCharEncodingHandler("ISO-8859-5");
				1766	if (handler != NULL) return(handler);
				1767	break;
				1768	case XML_CHAR_ENCODING_8859_6:
				1769	handler = xmlFindCharEncodingHandler("ISO-8859-6");
				1770	if (handler != NULL) return(handler);
				1771	break;
				1772	case XML_CHAR_ENCODING_8859_7:
				1773	handler = xmlFindCharEncodingHandler("ISO-8859-7");
				1774	if (handler != NULL) return(handler);
				1775	break;
				1776	case XML_CHAR_ENCODING_8859_8:
				1777	handler = xmlFindCharEncodingHandler("ISO-8859-8");
				1778	if (handler != NULL) return(handler);
				1779	break;
				1780	case XML_CHAR_ENCODING_8859_9:
				1781	handler = xmlFindCharEncodingHandler("ISO-8859-9");
				1782	if (handler != NULL) return(handler);
				1783	break;
				1784
				1785
				1786	case XML_CHAR_ENCODING_2022_JP:
				1787	handler = xmlFindCharEncodingHandler("ISO-2022-JP");
				1788	if (handler != NULL) return(handler);
				1789	break;
				1790	case XML_CHAR_ENCODING_SHIFT_JIS:
				1791	handler = xmlFindCharEncodingHandler("SHIFT-JIS");
				1792	if (handler != NULL) return(handler);
				1793	handler = xmlFindCharEncodingHandler("SHIFT_JIS");
				1794	if (handler != NULL) return(handler);
				1795	handler = xmlFindCharEncodingHandler("Shift_JIS");
				1796	if (handler != NULL) return(handler);
				1797	break;
				1798	case XML_CHAR_ENCODING_EUC_JP:
				1799	handler = xmlFindCharEncodingHandler("EUC-JP");
				1800	if (handler != NULL) return(handler);
				1801	break;
				1802	default:
				1803	break;
				1804	}
				1805
				1806	#ifdef DEBUG_ENCODING
				1807	xmlGenericError(xmlGenericErrorContext,
				1808	"No handler found for encoding %d\n", enc);
				1809	#endif
				1810	return(NULL);
				1811	}
				1812
				1813	/**
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1814	* xmlFindCharEncodingHandler:
				1815	* @name: a string describing the char encoding.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1816	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1817	* Search in the registered set the handler able to read/write that encoding.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1818	*
				1819	* Returns the handler or NULL if not found
				1820	*/
				1821	xmlCharEncodingHandlerPtr
				1822	xmlFindCharEncodingHandler(const char *name) {
				1823	const char *nalias;
				1824	const char *norig;
				1825	xmlCharEncoding alias;
				1826	#ifdef LIBXML_ICONV_ENABLED
				1827	xmlCharEncodingHandlerPtr enc;
				1828	iconv_t icv_in, icv_out;
				1829	#endif /* LIBXML_ICONV_ENABLED */
				1830	char upper[100];
				1831	int i;
				1832
				1833	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1834	if (name == NULL) return(xmlDefaultCharEncodingHandler);
				1835	if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
				1836
				1837	/*
				1838	* Do the alias resolution
				1839	*/
				1840	norig = name;
				1841	nalias = xmlGetEncodingAlias(name);
				1842	if (nalias != NULL)
				1843	name = nalias;
				1844
				1845	/*
				1846	* Check first for directly registered encoding names
				1847	*/
				1848	for (i = 0;i < 99;i++) {
				1849	upper[i] = toupper(name[i]);
				1850	if (upper[i] == 0) break;
				1851	}
				1852	upper[i] = 0;
				1853
				1854	for (i = 0;i < nbCharEncodingHandler; i++)
				1855	if (!strcmp(upper, handlers[i]->name)) {
				1856	#ifdef DEBUG_ENCODING
				1857	xmlGenericError(xmlGenericErrorContext,
				1858	"Found registered handler for encoding %s\n", name);
				1859	#endif
				1860	return(handlers[i]);
				1861	}
				1862
				1863	#ifdef LIBXML_ICONV_ENABLED
				1864	/* check whether iconv can handle this */
				1865	icv_in = iconv_open("UTF-8", name);
				1866	icv_out = iconv_open(name, "UTF-8");
				1867	if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
				1868	enc = (xmlCharEncodingHandlerPtr)
				1869	xmlMalloc(sizeof(xmlCharEncodingHandler));
				1870	if (enc == NULL) {
				1871	iconv_close(icv_in);
				1872	iconv_close(icv_out);
				1873	return(NULL);
				1874	}
				1875	enc->name = xmlMemStrdup(name);
				1876	enc->input = NULL;
				1877	enc->output = NULL;
				1878	enc->iconv_in = icv_in;
				1879	enc->iconv_out = icv_out;
				1880	#ifdef DEBUG_ENCODING
				1881	xmlGenericError(xmlGenericErrorContext,
				1882	"Found iconv handler for encoding %s\n", name);
				1883	#endif
				1884	return enc;
				1885	} else if ((icv_in != (iconv_t) -1) \|\| icv_out != (iconv_t) -1) {
				1886	xmlGenericError(xmlGenericErrorContext,
				1887	"iconv : problems with filters for '%s'\n", name);
				1888	}
				1889	#endif /* LIBXML_ICONV_ENABLED */
				1890
				1891	#ifdef DEBUG_ENCODING
				1892	xmlGenericError(xmlGenericErrorContext,
				1893	"No handler found for encoding %s\n", name);
				1894	#endif
				1895
				1896	/*
				1897	* Fallback using the canonical names
				1898	*/
				1899	alias = xmlParseCharEncoding(norig);
				1900	if (alias != XML_CHAR_ENCODING_ERROR) {
				1901	const char* canon;
				1902	canon = xmlGetCharEncodingName(alias);
				1903	if ((canon != NULL) && (strcmp(name, canon))) {
				1904	return(xmlFindCharEncodingHandler(canon));
				1905	}
				1906	}
				1907
				1908	return(NULL);
				1909	}
				1910
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1911	/************************************************************************
				1912	* *
				1913	* ICONV based generic conversion functions *
				1914	* *
				1915	************************************************************************/
				1916
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1917	#ifdef LIBXML_ICONV_ENABLED
				1918	/**
				1919	* xmlIconvWrapper:
				1920	* @cd: iconv converter data structure
				1921	* @out: a pointer to an array of bytes to store the result
				1922	* @outlen: the length of @out
				1923	* @in: a pointer to an array of ISO Latin 1 chars
				1924	* @inlen: the length of @in
				1925	*
				1926	* Returns 0 if success, or
				1927	* -1 by lack of space, or
				1928	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1929	* the result of transformation can't fit into the encoding we want), or
				1930	* -3 if there the last byte can't form a single output char.
				1931	*
				1932	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1933	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1934	* The value of @outlen after return is the number of ocetes consumed.
				1935	*/
				1936	static int
				1937	xmlIconvWrapper(iconv_t cd,
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1938	unsigned char out, int outlen,
				1939	const unsigned char in, int inlen) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1940
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1941	size_t icv_inlen = inlen, icv_outlen = outlen;
				1942	const char icv_in = (const char ) in;
				1943	char icv_out = (char ) out;
				1944	int ret;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1945
Darin Adler	699613b	2001-07-27 22:47:14 +0000	[diff] [blame]	1946	ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1947	if (in != NULL) {
				1948	*inlen -= icv_inlen;
				1949	*outlen -= icv_outlen;
				1950	} else {
				1951	*inlen = 0;
				1952	*outlen = 0;
				1953	}
				1954	if ((icv_inlen != 0) \|\| (ret == -1)) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1955	#ifdef EILSEQ
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1956	if (errno == EILSEQ) {
				1957	return -2;
				1958	} else
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1959	#endif
				1960	#ifdef E2BIG
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1961	if (errno == E2BIG) {
				1962	return -1;
				1963	} else
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1964	#endif
				1965	#ifdef EINVAL
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1966	if (errno == EINVAL) {
				1967	return -3;
				1968	} else
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1969	#endif
Daniel Veillard	9403a04	2001-05-28 11:00:53 +0000	[diff] [blame]	1970	{
				1971	return -3;
				1972	}
				1973	}
				1974	return 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1975	}
				1976	#endif /* LIBXML_ICONV_ENABLED */
				1977
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1978	/************************************************************************
				1979	* *
				1980	* The real API used by libxml for on-the-fly conversion *
				1981	* *
				1982	************************************************************************/
				1983
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1984	/**
				1985	* xmlCharEncFirstLine:
				1986	* @handler: char enconding transformation data structure
				1987	* @out: an xmlBuffer for the output.
				1988	* @in: an xmlBuffer for the input
				1989	*
				1990	* Front-end for the encoding handler input function, but handle only
				1991	* the very first line, i.e. limit itself to 45 chars.
				1992	*
				1993	* Returns the number of byte written if success, or
				1994	* -1 general error
				1995	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1996	* the result of transformation can't fit into the encoding we want), or
				1997	*/
				1998	int
				1999	xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				2000	xmlBufferPtr in) {
				2001	int ret = -2;
				2002	int written;
				2003	int toconv;
				2004
				2005	if (handler == NULL) return(-1);
				2006	if (out == NULL) return(-1);
				2007	if (in == NULL) return(-1);
				2008
				2009	written = out->size - out->use;
				2010	toconv = in->use;
				2011	if (toconv * 2 >= written) {
				2012	xmlBufferGrow(out, toconv);
				2013	written = out->size - out->use - 1;
				2014	}
				2015
				2016	/*
				2017	* echo '<?xml version="1.0" encoding="UCS4"?>' \| wc -c => 38
				2018	* 45 chars should be sufficient to reach the end of the encoding
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2019	* declaration without going too far inside the document content.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2020	*/
				2021	written = 45;
				2022
				2023	if (handler->input != NULL) {
				2024	ret = handler->input(&out->content[out->use], &written,
				2025	in->content, &toconv);
				2026	xmlBufferShrink(in, toconv);
				2027	out->use += written;
				2028	out->content[out->use] = 0;
				2029	}
				2030	#ifdef LIBXML_ICONV_ENABLED
				2031	else if (handler->iconv_in != NULL) {
				2032	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				2033	&written, in->content, &toconv);
				2034	xmlBufferShrink(in, toconv);
				2035	out->use += written;
				2036	out->content[out->use] = 0;
				2037	if (ret == -1) ret = -3;
				2038	}
				2039	#endif /* LIBXML_ICONV_ENABLED */
				2040	#ifdef DEBUG_ENCODING
				2041	switch (ret) {
				2042	case 0:
				2043	xmlGenericError(xmlGenericErrorContext,
				2044	"converted %d bytes to %d bytes of input\n",
				2045	toconv, written);
				2046	break;
				2047	case -1:
				2048	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				2049	toconv, written, in->use);
				2050	break;
				2051	case -2:
				2052	xmlGenericError(xmlGenericErrorContext,
				2053	"input conversion failed due to input error\n");
				2054	break;
				2055	case -3:
				2056	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				2057	toconv, written, in->use);
				2058	break;
				2059	default:
				2060	xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
				2061	}
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2062	#endif /* DEBUG_ENCODING */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2063	/*
				2064	* Ignore when input buffer is not on a boundary
				2065	*/
				2066	if (ret == -3) ret = 0;
				2067	if (ret == -1) ret = 0;
				2068	return(ret);
				2069	}
				2070
				2071	/**
				2072	* xmlCharEncInFunc:
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2073	* @handler: char encoding transformation data structure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2074	* @out: an xmlBuffer for the output.
				2075	* @in: an xmlBuffer for the input
				2076	*
				2077	* Generic front-end for the encoding handler input function
				2078	*
				2079	* Returns the number of byte written if success, or
				2080	* -1 general error
				2081	* -2 if the transcoding fails (for *in is not valid utf8 string or
				2082	* the result of transformation can't fit into the encoding we want), or
				2083	*/
				2084	int
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2085	xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
				2086	xmlBufferPtr in)
				2087	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2088	int ret = -2;
				2089	int written;
				2090	int toconv;
				2091
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2092	if (handler == NULL)
				2093	return (-1);
				2094	if (out == NULL)
				2095	return (-1);
				2096	if (in == NULL)
				2097	return (-1);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2098
				2099	toconv = in->use;
				2100	if (toconv == 0)
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2101	return (0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2102	written = out->size - out->use;
				2103	if (toconv * 2 >= written) {
				2104	xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2105	written = out->size - out->use - 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2106	}
				2107	if (handler->input != NULL) {
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2108	ret = handler->input(&out->content[out->use], &written,
				2109	in->content, &toconv);
				2110	xmlBufferShrink(in, toconv);
				2111	out->use += written;
				2112	out->content[out->use] = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2113	}
				2114	#ifdef LIBXML_ICONV_ENABLED
				2115	else if (handler->iconv_in != NULL) {
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2116	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				2117	&written, in->content, &toconv);
				2118	xmlBufferShrink(in, toconv);
				2119	out->use += written;
				2120	out->content[out->use] = 0;
				2121	if (ret == -1)
				2122	ret = -3;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2123	}
				2124	#endif /* LIBXML_ICONV_ENABLED */
				2125	switch (ret) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2126	case 0:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2127	#ifdef DEBUG_ENCODING
				2128	xmlGenericError(xmlGenericErrorContext,
				2129	"converted %d bytes to %d bytes of input\n",
				2130	toconv, written);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2131	#endif
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2132	break;
				2133	case -1:
				2134	#ifdef DEBUG_ENCODING
				2135	xmlGenericError(xmlGenericErrorContext,
				2136	"converted %d bytes to %d bytes of input, %d left\n",
				2137	toconv, written, in->use);
				2138	#endif
				2139	break;
				2140	case -3:
				2141	#ifdef DEBUG_ENCODING
				2142	xmlGenericError(xmlGenericErrorContext,
				2143	"converted %d bytes to %d bytes of input, %d left\n",
				2144	toconv, written, in->use);
				2145	#endif
				2146	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2147	case -2:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2148	xmlGenericError(xmlGenericErrorContext,
				2149	"input conversion failed due to input error\n");
				2150	xmlGenericError(xmlGenericErrorContext,
				2151	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2152	in->content[0], in->content[1],
				2153	in->content[2], in->content[3]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2154	}
				2155	/*
				2156	* Ignore when input buffer is not on a boundary
				2157	*/
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2158	if (ret == -3)
				2159	ret = 0;
Daniel Veillard	d076a20	2002-11-20 13:28:31 +0000	[diff] [blame]	2160	return (written);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2161	}
				2162
				2163	/**
				2164	* xmlCharEncOutFunc:
				2165	* @handler: char enconding transformation data structure
				2166	* @out: an xmlBuffer for the output.
				2167	* @in: an xmlBuffer for the input
				2168	*
				2169	* Generic front-end for the encoding handler output function
				2170	* a first call with @in == NULL has to be made firs to initiate the
				2171	* output in case of non-stateless encoding needing to initiate their
				2172	* state or the output (like the BOM in UTF16).
				2173	* In case of UTF8 sequence conversion errors for the given encoder,
				2174	* the content will be automatically remapped to a CharRef sequence.
				2175	*
				2176	* Returns the number of byte written if success, or
				2177	* -1 general error
				2178	* -2 if the transcoding fails (for *in is not valid utf8 string or
				2179	* the result of transformation can't fit into the encoding we want), or
				2180	*/
				2181	int
				2182	xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				2183	xmlBufferPtr in) {
				2184	int ret = -2;
				2185	int written;
				2186	int writtentot = 0;
				2187	int toconv;
				2188	int output = 0;
				2189
				2190	if (handler == NULL) return(-1);
				2191	if (out == NULL) return(-1);
				2192
				2193	retry:
				2194
				2195	written = out->size - out->use;
				2196
Igor Zlatkovic	73267db	2003-03-08 13:29:24 +0000	[diff] [blame^]	2197	if (written > 0)
				2198	written--; /* Gennady: count '/0' */
				2199
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2200	/*
				2201	* First specific handling of in = NULL, i.e. the initialization call
				2202	*/
				2203	if (in == NULL) {
				2204	toconv = 0;
				2205	if (handler->output != NULL) {
				2206	ret = handler->output(&out->content[out->use], &written,
				2207	NULL, &toconv);
Igor Zlatkovic	73267db	2003-03-08 13:29:24 +0000	[diff] [blame^]	2208	if (ret == 0) { /* Gennady: check return value */
				2209	out->use += written;
				2210	out->content[out->use] = 0;
				2211	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2212	}
				2213	#ifdef LIBXML_ICONV_ENABLED
				2214	else if (handler->iconv_out != NULL) {
				2215	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				2216	&written, NULL, &toconv);
				2217	out->use += written;
				2218	out->content[out->use] = 0;
				2219	}
				2220	#endif /* LIBXML_ICONV_ENABLED */
				2221	#ifdef DEBUG_ENCODING
				2222	xmlGenericError(xmlGenericErrorContext,
				2223	"initialized encoder\n");
				2224	#endif
				2225	return(0);
				2226	}
				2227
				2228	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2229	* Conversion itself.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2230	*/
				2231	toconv = in->use;
				2232	if (toconv == 0)
				2233	return(0);
				2234	if (toconv * 2 >= written) {
				2235	xmlBufferGrow(out, toconv * 2);
				2236	written = out->size - out->use - 1;
				2237	}
				2238	if (handler->output != NULL) {
				2239	ret = handler->output(&out->content[out->use], &written,
				2240	in->content, &toconv);
				2241	xmlBufferShrink(in, toconv);
				2242	out->use += written;
				2243	writtentot += written;
				2244	out->content[out->use] = 0;
				2245	}
				2246	#ifdef LIBXML_ICONV_ENABLED
				2247	else if (handler->iconv_out != NULL) {
				2248	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				2249	&written, in->content, &toconv);
				2250	xmlBufferShrink(in, toconv);
				2251	out->use += written;
				2252	writtentot += written;
				2253	out->content[out->use] = 0;
				2254	if (ret == -1) {
				2255	if (written > 0) {
				2256	/*
				2257	* Can be a limitation of iconv
				2258	*/
				2259	goto retry;
				2260	}
				2261	ret = -3;
				2262	}
				2263	}
				2264	#endif /* LIBXML_ICONV_ENABLED */
				2265	else {
				2266	xmlGenericError(xmlGenericErrorContext,
				2267	"xmlCharEncOutFunc: no output function !\n");
				2268	return(-1);
				2269	}
				2270
				2271	if (ret >= 0) output += ret;
				2272
				2273	/*
				2274	* Attempt to handle error cases
				2275	*/
				2276	switch (ret) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2277	case 0:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2278	#ifdef DEBUG_ENCODING
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2279	xmlGenericError(xmlGenericErrorContext,
				2280	"converted %d bytes to %d bytes of output\n",
				2281	toconv, written);
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2282	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2283	break;
				2284	case -1:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2285	#ifdef DEBUG_ENCODING
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2286	xmlGenericError(xmlGenericErrorContext,
				2287	"output conversion failed by lack of space\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2288	#endif
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2289	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2290	case -3:
Daniel Veillard	809faa5	2003-02-10 15:43:53 +0000	[diff] [blame]	2291	#ifdef DEBUG_ENCODING
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2292	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
				2293	toconv, written, in->use);
Daniel Veillard	809faa5	2003-02-10 15:43:53 +0000	[diff] [blame]	2294	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2295	break;
				2296	case -2: {
				2297	int len = in->use;
				2298	const xmlChar utf = (const xmlChar ) in->content;
				2299	int cur;
				2300
				2301	cur = xmlGetUTF8Char(utf, &len);
				2302	if (cur > 0) {
				2303	xmlChar charref[20];
				2304
				2305	#ifdef DEBUG_ENCODING
				2306	xmlGenericError(xmlGenericErrorContext,
				2307	"handling output conversion error\n");
				2308	xmlGenericError(xmlGenericErrorContext,
				2309	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2310	in->content[0], in->content[1],
				2311	in->content[2], in->content[3]);
				2312	#endif
				2313	/*
				2314	* Removes the UTF8 sequence, and replace it by a charref
				2315	* and continue the transcoding phase, hoping the error
				2316	* did not mangle the encoder state.
				2317	*/
Aleksey Sanin	49cc975	2002-06-14 17:07:10 +0000	[diff] [blame]	2318	snprintf((char *) charref, sizeof(charref), "&#%d;", cur);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2319	xmlBufferShrink(in, len);
				2320	xmlBufferAddHead(in, charref, -1);
				2321
				2322	goto retry;
				2323	} else {
				2324	xmlGenericError(xmlGenericErrorContext,
				2325	"output conversion failed due to conv error\n");
				2326	xmlGenericError(xmlGenericErrorContext,
				2327	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2328	in->content[0], in->content[1],
				2329	in->content[2], in->content[3]);
				2330	in->content[0] = ' ';
				2331	}
				2332	break;
				2333	}
				2334	}
				2335	return(ret);
				2336	}
				2337
				2338	/**
				2339	* xmlCharEncCloseFunc:
				2340	* @handler: char enconding transformation data structure
				2341	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2342	* Generic front-end for encoding handler close function
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2343	*
				2344	* Returns 0 if success, or -1 in case of error
				2345	*/
				2346	int
				2347	xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
				2348	int ret = 0;
				2349	if (handler == NULL) return(-1);
				2350	if (handler->name == NULL) return(-1);
				2351	#ifdef LIBXML_ICONV_ENABLED
				2352	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2353	* Iconv handlers can be used only once, free the whole block.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2354	* and the associated icon resources.
				2355	*/
				2356	if ((handler->iconv_out != NULL) \|\| (handler->iconv_in != NULL)) {
				2357	if (handler->name != NULL)
				2358	xmlFree(handler->name);
				2359	handler->name = NULL;
				2360	if (handler->iconv_out != NULL) {
				2361	if (iconv_close(handler->iconv_out))
				2362	ret = -1;
				2363	handler->iconv_out = NULL;
				2364	}
				2365	if (handler->iconv_in != NULL) {
				2366	if (iconv_close(handler->iconv_in))
				2367	ret = -1;
				2368	handler->iconv_in = NULL;
				2369	}
				2370	xmlFree(handler);
				2371	}
				2372	#endif /* LIBXML_ICONV_ENABLED */
				2373	#ifdef DEBUG_ENCODING
				2374	if (ret)
				2375	xmlGenericError(xmlGenericErrorContext,
				2376	"failed to close the encoding handler\n");
				2377	else
				2378	xmlGenericError(xmlGenericErrorContext,
				2379	"closed the encoding handler\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2380	#endif
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2381
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2382	return(ret);
				2383	}
				2384