Blame - encoding.c - platform/external/libxml2

blob: 7d1a97164f80cd239b1e6708c730ed2b1bf885c7 [file] [log] [blame]

Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1	/*
				2	* encoding.c : implements the encoding conversion functions needed for XML
				3	*
				4	* Related specs:
				5	* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
				6	* rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
				7	* [ISO-10646] UTF-8 and UTF-16 in Annexes
				8	* [ISO-8859-1] ISO Latin-1 characters codes.
				9	* [UNICODE] The Unicode Consortium, "The Unicode Standard --
				10	* Worldwide Character Encoding -- Version 1.0", Addison-
				11	* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
				12	* described in Unicode Technical Report #4.
				13	* [US-ASCII] Coded Character Set--7-bit American Standard Code for
				14	* Information Interchange, ANSI X3.4-1986.
				15	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	16	* See Copyright for the status of this software.
				17	*
Daniel Veillard	c5d6434	2001-06-24 12:13:24 +0000	[diff] [blame]	18	* daniel@veillard.com
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	19	*
				20	* UTF8 string routines from:
				21	* "William M. Brack" <wbrack@mmm.com.hk>
				22	*
				23	* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	24	*/
				25
Daniel Veillard	34ce8be	2002-03-18 19:37:11 +0000	[diff] [blame]	26	#define IN_LIBXML
Bjorn Reese	70a9da5	2001-04-21 16:57:29 +0000	[diff] [blame]	27	#include "libxml.h"
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	28
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	29	#include <string.h>
				30
				31	#ifdef HAVE_CTYPE_H
				32	#include <ctype.h>
				33	#endif
				34	#ifdef HAVE_STDLIB_H
				35	#include <stdlib.h>
				36	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	37	#ifdef LIBXML_ICONV_ENABLED
				38	#ifdef HAVE_ERRNO_H
				39	#include <errno.h>
				40	#endif
				41	#endif
				42	#include <libxml/encoding.h>
				43	#include <libxml/xmlmemory.h>
				44	#ifdef LIBXML_HTML_ENABLED
				45	#include <libxml/HTMLparser.h>
				46	#endif
Daniel Veillard	64a411c	2001-10-15 12:32:07 +0000	[diff] [blame]	47	#include <libxml/globals.h>
Daniel Veillard	a4617b8	2001-11-04 20:19:12 +0000	[diff] [blame]	48	#include <libxml/xmlerror.h>
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	49
Daniel Veillard	2209073	2001-07-16 00:06:07 +0000	[diff] [blame]	50	static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
				51	static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	52
				53	typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
				54	typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
				55	struct _xmlCharEncodingAlias {
				56	const char *name;
				57	const char *alias;
				58	};
				59
				60	static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
				61	static int xmlCharEncodingAliasesNb = 0;
				62	static int xmlCharEncodingAliasesMax = 0;
				63
				64	#ifdef LIBXML_ICONV_ENABLED
				65	#if 0
				66	#define DEBUG_ENCODING /* Define this to get encoding traces */
				67	#endif
				68	#endif
				69
				70	static int xmlLittleEndian = 1;
				71
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F          0xxxxxxx                                *
 * 0000 0080-0000 07FF          110xxxxx 10xxxxxx                       *
 * 0000 0800-0000 FFFF          1110xxxx 10xxxxxx 10xxxxxx              *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 *                                                                      *
 ************************************************************************/
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	86
				87	/**
William M. Brack	4a557d9	2003-07-29 04:28:04 +0000	[diff] [blame]	88	* xmlUTF8Size:
				89	* @utf: pointer to the UTF8 character
				90	*
				91	* returns the numbers of bytes in the character, -1 on format error
				92	*/
				93	int
				94	xmlUTF8Size(const xmlChar *utf) {
				95	xmlChar mask;
				96	int len;
				97
				98	if (utf == NULL)
				99	return -1;
				100	if (*utf < 0x80)
				101	return 1;
				102	/* check valid UTF8 character */
				103	if (!(*utf & 0x40))
				104	return -1;
				105	/* determine number of bytes in char */
				106	len = 2;
				107	for (mask=0x20; mask != 0; mask>>=1) {
				108	if (!(*utf & mask))
				109	return len;
				110	len++;
				111	}
				112	return -1;
				113	}
				114
				115	/**
				116	* xmlUTF8Charcmp
				117	* @utf1: pointer to first UTF8 char
				118	* @utf2: pointer to second UTF8 char
				119	*
				120	* returns result of comparing the two UCS4 values
				121	* as with xmlStrncmp
				122	*/
				123	int
				124	xmlUTF8Charcmp(const xmlChar utf1, const xmlChar utf2) {
				125
				126	if (utf1 == NULL ) {
				127	if (utf2 == NULL)
				128	return 0;
				129	return -1;
				130	}
Daniel Veillard	9ff7de1	2003-07-29 13:30:42 +0000	[diff] [blame]	131	return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
William M. Brack	4a557d9	2003-07-29 04:28:04 +0000	[diff] [blame]	132	}
				133
				134	/**
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame]	135	* xmlUTF8Strlen:
				136	* @utf: a sequence of UTF-8 encoded bytes
				137	*
Daniel Veillard	60087f3	2001-10-10 09:45:09 +0000	[diff] [blame]	138	* compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame]	139	* checking of the content of the string.
				140	*
				141	* Returns the number of characters in the string or -1 in case of error
				142	*/
				143	int
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	144	xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillard	e043ee1	2001-04-16 14:08:07 +0000	[diff] [blame]	145	int ret = 0;
				146
				147	if (utf == NULL)
				148	return(-1);
				149
				150	while (*utf != 0) {
				151	if (utf[0] & 0x80) {
				152	if ((utf[1] & 0xc0) != 0x80)
				153	return(-1);
				154	if ((utf[0] & 0xe0) == 0xe0) {
				155	if ((utf[2] & 0xc0) != 0x80)
				156	return(-1);
				157	if ((utf[0] & 0xf0) == 0xf0) {
				158	if ((utf[0] & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				159	return(-1);
				160	utf += 4;
				161	} else {
				162	utf += 3;
				163	}
				164	} else {
				165	utf += 2;
				166	}
				167	} else {
				168	utf++;
				169	}
				170	ret++;
				171	}
				172	return(ret);
				173	}
				174
				175	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	176	* xmlGetUTF8Char:
				177	* @utf: a sequence of UTF-8 encoded bytes
				178	* @len: a pointer to @bytes len
				179	*
				180	* Read one UTF8 Char from @utf
				181	*
				182	* Returns the char value or -1 in case of error and update @len with the
				183	* number of bytes used
				184	*/
Daniel Veillard	f000f07	2002-10-22 14:28:17 +0000	[diff] [blame]	185	int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	186	xmlGetUTF8Char(const unsigned char utf, int len) {
				187	unsigned int c;
				188
				189	if (utf == NULL)
				190	goto error;
				191	if (len == NULL)
				192	goto error;
				193	if (*len < 1)
				194	goto error;
				195
				196	c = utf[0];
				197	if (c & 0x80) {
				198	if (*len < 2)
				199	goto error;
				200	if ((utf[1] & 0xc0) != 0x80)
				201	goto error;
				202	if ((c & 0xe0) == 0xe0) {
				203	if (*len < 3)
				204	goto error;
				205	if ((utf[2] & 0xc0) != 0x80)
				206	goto error;
				207	if ((c & 0xf0) == 0xf0) {
				208	if (*len < 4)
				209	goto error;
				210	if ((c & 0xf8) != 0xf0 \|\| (utf[3] & 0xc0) != 0x80)
				211	goto error;
				212	*len = 4;
				213	/* 4-byte code */
				214	c = (utf[0] & 0x7) << 18;
				215	c \|= (utf[1] & 0x3f) << 12;
				216	c \|= (utf[2] & 0x3f) << 6;
				217	c \|= utf[3] & 0x3f;
				218	} else {
				219	/* 3-byte code */
				220	*len = 3;
				221	c = (utf[0] & 0xf) << 12;
				222	c \|= (utf[1] & 0x3f) << 6;
				223	c \|= utf[2] & 0x3f;
				224	}
				225	} else {
				226	/* 2-byte code */
				227	*len = 2;
				228	c = (utf[0] & 0x1f) << 6;
				229	c \|= utf[1] & 0x3f;
				230	}
				231	} else {
				232	/* 1-byte code */
				233	*len = 1;
				234	}
				235	return(c);
				236
				237	error:
				238	*len = 0;
				239	return(-1);
				240	}
				241
				242	/**
Daniel Veillard	01c13b5	2002-12-10 15:19:08 +0000	[diff] [blame]	243	* xmlCheckUTF8:
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	244	* @utf: Pointer to putative utf-8 encoded string.
				245	*
				246	* Checks @utf for being valid utf-8. @utf is assumed to be
				247	* null-terminated. This function is not super-strict, as it will
				248	* allow longer utf-8 sequences than necessary. Note that Java is
				249	* capable of producing these sequences if provoked. Also note, this
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	250	* routine checks for the 4-byte maximum size, but does not check for
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	251	* 0x10ffff maximum value.
				252	*
				253	* Return value: true if @utf is valid.
				254	**/
				255	int
				256	xmlCheckUTF8(const unsigned char *utf)
				257	{
				258	int ix;
				259	unsigned char c;
				260
				261	for (ix = 0; (c = utf[ix]);) {
				262	if (c & 0x80) {
				263	if ((utf[ix + 1] & 0xc0) != 0x80)
				264	return(0);
				265	if ((c & 0xe0) == 0xe0) {
				266	if ((utf[ix + 2] & 0xc0) != 0x80)
				267	return(0);
				268	if ((c & 0xf0) == 0xf0) {
				269	if ((c & 0xf8) != 0xf0 \|\| (utf[ix + 3] & 0xc0) != 0x80)
				270	return(0);
				271	ix += 4;
				272	/* 4-byte code */
				273	} else
				274	/* 3-byte code */
				275	ix += 3;
				276	} else
				277	/* 2-byte code */
				278	ix += 2;
				279	} else
				280	/* 1-byte code */
				281	ix++;
				282	}
				283	return(1);
				284	}
				285
				286	/**
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	287	* xmlUTF8Strsize:
				288	* @utf: a sequence of UTF-8 encoded bytes
				289	* @len: the number of characters in the array
				290	*
				291	* storage size of an UTF8 string
				292	*
				293	* Returns the storage size of
				294	* the first 'len' characters of ARRAY
				295	*
				296	*/
				297
				298	int
				299	xmlUTF8Strsize(const xmlChar *utf, int len) {
				300	const xmlChar *ptr=utf;
				301	xmlChar ch;
				302
				303	if (len <= 0)
				304	return(0);
				305
				306	while ( len-- > 0) {
				307	if ( !*ptr )
				308	break;
				309	if ( (ch = *ptr++) & 0x80)
				310	while ( (ch<<=1) & 0x80 )
				311	ptr++;
				312	}
				313	return (ptr - utf);
				314	}
				315
				316
				317	/**
				318	* xmlUTF8Strndup:
				319	* @utf: the input UTF8 *
				320	* @len: the len of @utf (in chars)
				321	*
				322	* a strndup for array of UTF8's
				323	*
				324	* Returns a new UTF8 * or NULL
				325	*/
				326	xmlChar *
				327	xmlUTF8Strndup(const xmlChar *utf, int len) {
				328	xmlChar *ret;
				329	int i;
				330
				331	if ((utf == NULL) \|\| (len < 0)) return(NULL);
				332	i = xmlUTF8Strsize(utf, len);
Daniel Veillard	3c908dc	2003-04-19 00:07:51 +0000	[diff] [blame]	333	ret = (xmlChar ) xmlMallocAtomic((i + 1) sizeof(xmlChar));
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	334	if (ret == NULL) {
				335	xmlGenericError(xmlGenericErrorContext,
				336	"malloc of %ld byte failed\n",
				337	(len + 1) * (long)sizeof(xmlChar));
				338	return(NULL);
				339	}
				340	memcpy(ret, utf, i * sizeof(xmlChar));
				341	ret[i] = 0;
				342	return(ret);
				343	}
				344
				345	/**
				346	* xmlUTF8Strpos:
				347	* @utf: the input UTF8 *
				348	* @pos: the position of the desired UTF8 char (in chars)
				349	*
				350	* a function to provide the equivalent of fetching a
				351	* character from a string array
				352	*
				353	* Returns a pointer to the UTF8 character or NULL
				354	*/
				355	xmlChar *
				356	xmlUTF8Strpos(const xmlChar *utf, int pos) {
				357	xmlChar ch;
				358
				359	if (utf == NULL) return(NULL);
				360	if ( (pos < 0) \|\| (pos >= xmlUTF8Strlen(utf)) )
				361	return(NULL);
				362	while (pos--) {
				363	if ((ch=*utf++) == 0) return(NULL);
				364	if ( ch & 0x80 ) {
				365	/* if not simple ascii, verify proper format */
				366	if ( (ch & 0xc0) != 0xc0 )
				367	return(NULL);
				368	/* then skip over remaining bytes for this char */
				369	while ( (ch <<= 1) & 0x80 )
				370	if ( (*utf++ & 0xc0) != 0x80 )
				371	return(NULL);
				372	}
				373	}
				374	return((xmlChar *)utf);
				375	}
				376
				377	/**
				378	* xmlUTF8Strloc:
				379	* @utf: the input UTF8 *
				380	* @utfchar: the UTF8 character to be found
				381	*
				382	* a function to provide relative location of a UTF8 char
				383	*
				384	* Returns the relative character position of the desired char
				385	* or -1 if not found
				386	*/
				387	int
				388	xmlUTF8Strloc(const xmlChar utf, const xmlChar utfchar) {
				389	int i, size;
				390	xmlChar ch;
				391
				392	if (utf==NULL \|\| utfchar==NULL) return -1;
				393	size = xmlUTF8Strsize(utfchar, 1);
				394	for(i=0; (ch=*utf) != 0; i++) {
				395	if (xmlStrncmp(utf, utfchar, size)==0)
				396	return(i);
				397	utf++;
				398	if ( ch & 0x80 ) {
				399	/* if not simple ascii, verify proper format */
				400	if ( (ch & 0xc0) != 0xc0 )
				401	return(-1);
				402	/* then skip over remaining bytes for this char */
				403	while ( (ch <<= 1) & 0x80 )
				404	if ( (*utf++ & 0xc0) != 0x80 )
				405	return(-1);
				406	}
				407	}
				408
				409	return(-1);
				410	}
				411	/**
				412	* xmlUTF8Strsub:
				413	* @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	414	* @start: relative pos of first char
				415	* @len: total number to copy
				416	*
				417	* Note: positions are given in units of UTF-8 chars
				418	*
				419	* Returns a pointer to a newly created string
				420	* or NULL if any problem
				421	*/
				422
				423	xmlChar *
				424	xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
				425	int i;
				426	xmlChar ch;
				427
				428	if (utf == NULL) return(NULL);
				429	if (start < 0) return(NULL);
				430	if (len < 0) return(NULL);
				431
				432	/*
				433	* Skip over any leading chars
				434	*/
				435	for (i = 0;i < start;i++) {
				436	if ((ch=*utf++) == 0) return(NULL);
				437	if ( ch & 0x80 ) {
				438	/* if not simple ascii, verify proper format */
				439	if ( (ch & 0xc0) != 0xc0 )
				440	return(NULL);
				441	/* then skip over remaining bytes for this char */
				442	while ( (ch <<= 1) & 0x80 )
				443	if ( (*utf++ & 0xc0) != 0x80 )
				444	return(NULL);
				445	}
				446	}
				447
				448	return(xmlUTF8Strndup(utf, len));
				449	}
				450
/************************************************************************
 *                                                                      *
 *              Conversions To/From UTF8 encoding                       *
 *                                                                      *
 ************************************************************************/
				456
				457	/**
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	458	* asciiToUTF8:
				459	* @out: a pointer to an array of bytes to store the result
				460	* @outlen: the length of @out
				461	* @in: a pointer to an array of ASCII chars
				462	* @inlen: the length of @in
				463	*
				464	* Take a block of ASCII chars in and try to convert it to an UTF-8
				465	* block of chars out.
				466	* Returns 0 if success, or -1 otherwise
				467	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	468	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	469	* The value of @outlen after return is the number of ocetes consumed.
				470	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	471	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	472	asciiToUTF8(unsigned char* out, int *outlen,
				473	const unsigned char* in, int *inlen) {
				474	unsigned char* outstart = out;
				475	const unsigned char* base = in;
				476	const unsigned char* processed = in;
				477	unsigned char* outend = out + *outlen;
				478	const unsigned char* inend;
				479	unsigned int c;
				480	int bits;
				481
				482	inend = in + (*inlen);
				483	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				484	c= *in++;
				485
				486	/* assertion: c is a single UTF-4 value */
				487	if (out >= outend)
				488	break;
				489	if (c < 0x80) { *out++= c; bits= -6; }
				490	else {
				491	*outlen = out - outstart;
				492	*inlen = processed - base;
				493	return(-1);
				494	}
				495
				496	for ( ; bits >= 0; bits-= 6) {
				497	if (out >= outend)
				498	break;
				499	*out++= ((c >> bits) & 0x3F) \| 0x80;
				500	}
				501	processed = (const unsigned char*) in;
				502	}
				503	*outlen = out - outstart;
				504	*inlen = processed - base;
				505	return(0);
				506	}
				507
				508	/**
				509	* UTF8Toascii:
				510	* @out: a pointer to an array of bytes to store the result
				511	* @outlen: the length of @out
				512	* @in: a pointer to an array of UTF-8 chars
				513	* @inlen: the length of @in
				514	*
				515	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				516	* block of chars out.
				517	*
				518	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				519	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	520	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	521	* The value of @outlen after return is the number of ocetes consumed.
				522	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	523	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	524	UTF8Toascii(unsigned char* out, int *outlen,
				525	const unsigned char* in, int *inlen) {
				526	const unsigned char* processed = in;
				527	const unsigned char* outend;
				528	const unsigned char* outstart = out;
				529	const unsigned char* instart = in;
				530	const unsigned char* inend;
				531	unsigned int c, d;
				532	int trailing;
				533
				534	if (in == NULL) {
				535	/*
				536	* initialization nothing to do
				537	*/
				538	*outlen = 0;
				539	*inlen = 0;
				540	return(0);
				541	}
				542	inend = in + (*inlen);
				543	outend = out + (*outlen);
				544	while (in < inend) {
				545	d = *in++;
				546	if (d < 0x80) { c= d; trailing= 0; }
				547	else if (d < 0xC0) {
				548	/* trailing byte in leading position */
				549	*outlen = out - outstart;
				550	*inlen = processed - instart;
				551	return(-2);
				552	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				553	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				554	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				555	else {
				556	/* no chance for this in Ascii */
				557	*outlen = out - outstart;
				558	*inlen = processed - instart;
				559	return(-2);
				560	}
				561
				562	if (inend - in < trailing) {
				563	break;
				564	}
				565
				566	for ( ; trailing; trailing--) {
				567	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				568	break;
				569	c <<= 6;
				570	c \|= d & 0x3F;
				571	}
				572
				573	/* assertion: c is a single UTF-4 value */
				574	if (c < 0x80) {
				575	if (out >= outend)
				576	break;
				577	*out++ = c;
				578	} else {
				579	/* no chance for this in Ascii */
				580	*outlen = out - outstart;
				581	*inlen = processed - instart;
				582	return(-2);
				583	}
				584	processed = in;
				585	}
				586	*outlen = out - outstart;
				587	*inlen = processed - instart;
				588	return(0);
				589	}
				590
				591	/**
				592	* isolat1ToUTF8:
				593	* @out: a pointer to an array of bytes to store the result
				594	* @outlen: the length of @out
				595	* @in: a pointer to an array of ISO Latin 1 chars
				596	* @inlen: the length of @in
				597	*
				598	* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
				599	* block of chars out.
				600	* Returns 0 if success, or -1 otherwise
				601	* The value of @inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	602	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	603	* The value of @outlen after return is the number of ocetes consumed.
				604	*/
				605	int
				606	isolat1ToUTF8(unsigned char* out, int *outlen,
				607	const unsigned char* in, int *inlen) {
				608	unsigned char* outstart = out;
				609	const unsigned char* base = in;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	610	unsigned char* outend = out + *outlen;
				611	const unsigned char* inend;
Daniel Veillard	e72c756	2002-05-31 09:47:30 +0000	[diff] [blame]	612	const unsigned char* instop;
				613	xmlChar c = *in;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	614
				615	inend = in + (*inlen);
Daniel Veillard	e72c756	2002-05-31 09:47:30 +0000	[diff] [blame]	616	instop = inend;
				617
				618	while (in < inend && out < outend - 1) {
				619	if (c >= 0x80) {
Daniel Veillard	db55291	2002-03-21 13:27:59 +0000	[diff] [blame]	620	*out++= ((c >> 6) & 0x1F) \| 0xC0;
Daniel Veillard	02141ea	2001-04-30 11:46:40 +0000	[diff] [blame]	621	*out++= (c & 0x3F) \| 0x80;
Daniel Veillard	e72c756	2002-05-31 09:47:30 +0000	[diff] [blame]	622	++in;
				623	c = *in;
				624	}
				625	if (instop - in > outend - out) instop = in + (outend - out);
				626	while (c < 0x80 && in < instop) {
				627	*out++ = c;
				628	++in;
				629	c = *in;
				630	}
				631	}
				632	if (in < inend && out < outend && c < 0x80) {
				633	*out++ = c;
				634	++in;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	635	}
				636	*outlen = out - outstart;
Daniel Veillard	e72c756	2002-05-31 09:47:30 +0000	[diff] [blame]	637	*inlen = in - base;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	638	return(0);
				639	}
				640
/**
 * UTF8ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-8 chars
 * @inlenb:  the length of @in in UTF-8 chars
 *
 * No op copy operation for UTF8 handling: copies as many bytes as both
 * buffers allow, without looking at their content.
 *
 * Returns 0 on success, or -1 if an argument is NULL or the length to
 * copy is negative.
 * On success both *@outlen and *@inlenb are set to the number of bytes
 * copied.
 */
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
           const unsigned char* inb, int *inlenb)
{
    int len;

    if ((out == NULL) || (inb == NULL) || (outlen == NULL) || (inlenb == NULL))
        return(-1);
    /* compare the lengths themselves, not the addresses holding them */
    if (*outlen > *inlenb) {
        len = *inlenb;
    } else {
        len = *outlen;
    }
    if (len < 0)
        return(-1);

    memcpy(out, inb, len);

    *outlen = len;
    *inlenb = len;
    return(0);
}
				677
Daniel Veillard	e72c756	2002-05-31 09:47:30 +0000	[diff] [blame]	678
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. A NULL @in is an initialization call and does
 * nothing. Conversion stops quietly when @out is full or when the input
 * ends in the middle of a sequence.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a character above 0xFF).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        } else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in IsoLat1 */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        }

        /* incomplete sequence at end of input: wait for more data */
        if (inend - in < trailing) {
            break;
        }

        /* the check above guarantees all trailing bytes are in range */
        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                *outlen = out - outstart;
                *inlen = processed - instart;
                return(-2);
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c <= 0xFF) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* no chance for this in IsoLat1 */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        }
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);
}
				766
				767	/**
				768	* UTF16LEToUTF8:
				769	* @out: a pointer to an array of bytes to store the result
				770	* @outlen: the length of @out
				771	* @inb: a pointer to an array of UTF-16LE passwd as a byte array
				772	* @inlenb: the length of @in in UTF-16LE chars
				773	*
				774	* Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	775	* block of chars out. This function assume the endian property
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	776	* is the same between the native type of this machine and the
				777	* inputed one.
				778	*
				779	* Returns the number of byte written, or -1 by lack of space, or -2
				780	* if the transcoding fails (for *in is not valid utf16 string)
				781	* The value of *inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	782	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	783	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	784	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	785	UTF16LEToUTF8(unsigned char* out, int *outlen,
				786	const unsigned char* inb, int *inlenb)
				787	{
				788	unsigned char* outstart = out;
				789	const unsigned char* processed = inb;
				790	unsigned char* outend = out + *outlen;
				791	unsigned short* in = (unsigned short*) inb;
				792	unsigned short* inend;
				793	unsigned int c, d, inlen;
				794	unsigned char *tmp;
				795	int bits;
				796
				797	if ((*inlenb % 2) == 1)
				798	(*inlenb)--;
				799	inlen = *inlenb / 2;
				800	inend = in + inlen;
				801	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				802	if (xmlLittleEndian) {
				803	c= *in++;
				804	} else {
				805	tmp = (unsigned char *) in;
				806	c = *tmp++;
				807	c = c \| (((unsigned int)*tmp) << 8);
				808	in++;
				809	}
				810	if ((c & 0xFC00) == 0xD800) { /* surrogates */
				811	if (in >= inend) { /* (in > inend) shouldn't happens */
				812	break;
				813	}
				814	if (xmlLittleEndian) {
				815	d = *in++;
				816	} else {
				817	tmp = (unsigned char *) in;
				818	d = *tmp++;
				819	d = d \| (((unsigned int)*tmp) << 8);
				820	in++;
				821	}
				822	if ((d & 0xFC00) == 0xDC00) {
				823	c &= 0x03FF;
				824	c <<= 10;
				825	c \|= d & 0x03FF;
				826	c += 0x10000;
				827	}
				828	else {
				829	*outlen = out - outstart;
				830	*inlenb = processed - inb;
				831	return(-2);
				832	}
				833	}
				834
				835	/* assertion: c is a single UTF-4 value */
				836	if (out >= outend)
				837	break;
				838	if (c < 0x80) { *out++= c; bits= -6; }
				839	else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				840	else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				841	else { *out++= ((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				842
				843	for ( ; bits >= 0; bits-= 6) {
				844	if (out >= outend)
				845	break;
				846	*out++= ((c >> bits) & 0x3F) \| 0x80;
				847	}
				848	processed = (const unsigned char*) in;
				849	}
				850	*outlen = out - outstart;
				851	*inlenb = processed - inb;
				852	return(0);
				853	}
				854
				855	/**
				856	* UTF8ToUTF16LE:
				857	* @outb: a pointer to an array of bytes to store the result
				858	* @outlen: the length of @outb
				859	* @in: a pointer to an array of UTF-8 chars
				860	* @inlen: the length of @in
				861	*
				862	* Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
				863	* block of chars out.
				864	*
				865	* Returns the number of byte written, or -1 by lack of space, or -2
				866	* if the transcoding failed.
				867	*/
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    /*
     * Convert the UTF-8 bytes at @in into UTF-16LE at @outb.
     *
     * On return *outlen is the number of bytes written to @outb and
     * *inlen the number of input bytes fully converted. Returns 0 on
     * success (also when conversion stopped early because @outb is full
     * or the input ends inside a multi-byte sequence), 2 when @in is NULL
     * and the Byte Order Mark was written, and -2 on an invalid lead byte.
     *
     * The output is written one byte at a time, so the result does not
     * depend on the endianness or alignment requirements of the host.
     */
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *processed = in;
    const unsigned char *inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    /* Only whole 16-bit units can be stored: ignore an odd trailing byte. */
    outend = outb + ((*outlen > 0) ? (*outlen / 2) * 2 : 0);
    inend = in + ((*inlen > 0) ? *inlen : 0);

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            *outlen = (int) (out - outb);
            *inlen = (int) (processed - instart);
            return(-2);
        } else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            *outlen = (int) (out - outb);
            *inlen = (int) (processed - instart);
            return(-2);
        }

        /* incomplete sequence at the end of the block: leave it unconsumed */
        if (inend - in < trailing) {
            break;
        }

        /*
         * NOTE(review): a byte that is not a continuation byte ends the
         * loop early but is still consumed, and the partial value is
         * emitted; this historical behaviour is kept unchanged.
         */
        for ( ; trailing; trailing--) {
            if ((in >= inend) || (((d = *in++) & 0xC0) != 0x80))
                break;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c & 0xFF);
            out[1] = (unsigned char) (c >> 8);
            out += 2;
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi & 0xFF);
            out[1] = (unsigned char) (hi >> 8);
            out[2] = (unsigned char) (lo & 0xFF);
            out[3] = (unsigned char) (lo >> 8);
            out += 4;
        }
        else
            break;
        processed = in;
    }
    *outlen = (int) (out - outb);
    /* measured from the start of the input, not from the moving cursor */
    *inlen = (int) (processed - instart);
    return(0);
}
				973
				974	/**
				975	* UTF16BEToUTF8:
				976	* @out: a pointer to an array of bytes to store the result
				977	* @outlen: the length of @out
				978	* @inb: a pointer to an array of UTF-16 passwd as a byte array
				979	* @inlenb: the length of @in in UTF-16 chars
				980	*
				981	* Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	982	* block of chars out. This function assume the endian property
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	983	* is the same between the native type of this machine and the
				984	* inputed one.
				985	*
				986	* Returns the number of byte written, or -1 by lack of space, or -2
				987	* if the transcoding fails (for *in is not valid utf16 string)
				988	* The value of *inlen after return is the number of octets consumed
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	989	* as the return value is positive, else unpredictable.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	990	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	991	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	992	UTF16BEToUTF8(unsigned char* out, int *outlen,
				993	const unsigned char* inb, int *inlenb)
				994	{
				995	unsigned char* outstart = out;
				996	const unsigned char* processed = inb;
				997	unsigned char* outend = out + *outlen;
				998	unsigned short* in = (unsigned short*) inb;
				999	unsigned short* inend;
				1000	unsigned int c, d, inlen;
				1001	unsigned char *tmp;
				1002	int bits;
				1003
				1004	if ((*inlenb % 2) == 1)
				1005	(*inlenb)--;
				1006	inlen = *inlenb / 2;
				1007	inend= in + inlen;
				1008	while (in < inend) {
				1009	if (xmlLittleEndian) {
				1010	tmp = (unsigned char *) in;
				1011	c = *tmp++;
				1012	c = c << 8;
				1013	c = c \| (unsigned int) *tmp;
				1014	in++;
				1015	} else {
				1016	c= *in++;
				1017	}
				1018	if ((c & 0xFC00) == 0xD800) { /* surrogates */
				1019	if (in >= inend) { /* (in > inend) shouldn't happens */
				1020	*outlen = out - outstart;
				1021	*inlenb = processed - inb;
				1022	return(-2);
				1023	}
				1024	if (xmlLittleEndian) {
				1025	tmp = (unsigned char *) in;
				1026	d = *tmp++;
				1027	d = d << 8;
				1028	d = d \| (unsigned int) *tmp;
				1029	in++;
				1030	} else {
				1031	d= *in++;
				1032	}
				1033	if ((d & 0xFC00) == 0xDC00) {
				1034	c &= 0x03FF;
				1035	c <<= 10;
				1036	c \|= d & 0x03FF;
				1037	c += 0x10000;
				1038	}
				1039	else {
				1040	*outlen = out - outstart;
				1041	*inlenb = processed - inb;
				1042	return(-2);
				1043	}
				1044	}
				1045
				1046	/* assertion: c is a single UTF-4 value */
				1047	if (out >= outend)
				1048	break;
				1049	if (c < 0x80) { *out++= c; bits= -6; }
				1050	else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				1051	else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
				1052	else { *out++= ((c >> 18) & 0x07) \| 0xF0; bits= 12; }
				1053
				1054	for ( ; bits >= 0; bits-= 6) {
				1055	if (out >= outend)
				1056	break;
				1057	*out++= ((c >> bits) & 0x3F) \| 0x80;
				1058	}
				1059	processed = (const unsigned char*) in;
				1060	}
				1061	*outlen = out - outstart;
				1062	*inlenb = processed - inb;
				1063	return(0);
				1064	}
				1065
				1066	/**
				1067	* UTF8ToUTF16BE:
				1068	* @outb: a pointer to an array of bytes to store the result
				1069	* @outlen: the length of @outb
				1070	* @in: a pointer to an array of UTF-8 chars
				1071	* @inlen: the length of @in
				1072	*
				1073	* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
				1074	* block of chars out.
				1075	*
				1076	* Returns the number of byte written, or -1 by lack of space, or -2
				1077	* if the transcoding failed.
				1078	*/
Daniel Veillard	56a4cb8	2001-03-24 17:00:36 +0000	[diff] [blame]	1079	static int
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1080	UTF8ToUTF16BE(unsigned char* outb, int *outlen,
				1081	const unsigned char* in, int *inlen)
				1082	{
				1083	unsigned short* out = (unsigned short*) outb;
				1084	const unsigned char* processed = in;
				1085	unsigned short* outstart= out;
				1086	unsigned short* outend;
				1087	const unsigned char* inend= in+*inlen;
				1088	unsigned int c, d;
				1089	int trailing;
				1090	unsigned char *tmp;
				1091	unsigned short tmp1, tmp2;
				1092
				1093	if (in == NULL) {
				1094	/*
				1095	* initialization, add the Byte Order Mark
				1096	*/
				1097	if (*outlen >= 2) {
				1098	outb[0] = 0xFE;
				1099	outb[1] = 0xFF;
				1100	*outlen = 2;
				1101	*inlen = 0;
				1102	#ifdef DEBUG_ENCODING
				1103	xmlGenericError(xmlGenericErrorContext,
				1104	"Added FEFF Byte Order Mark\n");
				1105	#endif
				1106	return(2);
				1107	}
				1108	*outlen = 0;
				1109	*inlen = 0;
				1110	return(0);
				1111	}
				1112	outend = out + (*outlen / 2);
				1113	while (in < inend) {
				1114	d= *in++;
				1115	if (d < 0x80) { c= d; trailing= 0; }
				1116	else if (d < 0xC0) {
				1117	/* trailing byte in leading position */
				1118	*outlen = out - outstart;
				1119	*inlen = processed - in;
				1120	return(-2);
				1121	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				1122	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				1123	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				1124	else {
				1125	/* no chance for this in UTF-16 */
				1126	*outlen = out - outstart;
				1127	*inlen = processed - in;
				1128	return(-2);
				1129	}
				1130
				1131	if (inend - in < trailing) {
				1132	break;
				1133	}
				1134
				1135	for ( ; trailing; trailing--) {
				1136	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80)) break;
				1137	c <<= 6;
				1138	c \|= d & 0x3F;
				1139	}
				1140
				1141	/* assertion: c is a single UTF-4 value */
				1142	if (c < 0x10000) {
				1143	if (out >= outend) break;
				1144	if (xmlLittleEndian) {
				1145	tmp = (unsigned char *) out;
				1146	*tmp = c >> 8;
				1147	*(tmp + 1) = c;
				1148	out++;
				1149	} else {
				1150	*out++ = c;
				1151	}
				1152	}
				1153	else if (c < 0x110000) {
				1154	if (out+1 >= outend) break;
				1155	c -= 0x10000;
				1156	if (xmlLittleEndian) {
				1157	tmp1 = 0xD800 \| (c >> 10);
				1158	tmp = (unsigned char *) out;
				1159	*tmp = tmp1 >> 8;
				1160	*(tmp + 1) = (unsigned char) tmp1;
				1161	out++;
				1162
				1163	tmp2 = 0xDC00 \| (c & 0x03FF);
				1164	tmp = (unsigned char *) out;
				1165	*tmp = tmp2 >> 8;
				1166	*(tmp + 1) = (unsigned char) tmp2;
				1167	out++;
				1168	} else {
				1169	*out++ = 0xD800 \| (c >> 10);
				1170	*out++ = 0xDC00 \| (c & 0x03FF);
				1171	}
				1172	}
				1173	else
				1174	break;
				1175	processed = in;
				1176	}
				1177	outlen = (out - outstart) 2;
				1178	*inlen = processed - in;
				1179	return(0);
				1180	}
				1181
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1182	/************************************************************************
				1183	* *
				1184	* Generic encoding handling routines *
				1185	* *
				1186	************************************************************************/
				1187
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1188	/**
				1189	* xmlDetectCharEncoding:
				1190	* @in: a pointer to the first bytes of the XML entity, must be at least
				1191	* 4 bytes long.
				1192	* @len: pointer to the length of the buffer
				1193	*
				1194	* Guess the encoding of the entity using the first bytes of the entity content
				1195	* accordingly of the non-normative appendix F of the XML-1.0 recommendation.
				1196	*
				1197	* Returns one of the XML_CHAR_ENCODING_... values.
				1198	*/
				1199	xmlCharEncoding
				1200	xmlDetectCharEncoding(const unsigned char* in, int len)
				1201	{
				1202	if (len >= 4) {
				1203	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				1204	(in[2] == 0x00) && (in[3] == 0x3C))
				1205	return(XML_CHAR_ENCODING_UCS4BE);
				1206	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
				1207	(in[2] == 0x00) && (in[3] == 0x00))
				1208	return(XML_CHAR_ENCODING_UCS4LE);
				1209	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				1210	(in[2] == 0x3C) && (in[3] == 0x00))
				1211	return(XML_CHAR_ENCODING_UCS4_2143);
				1212	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
				1213	(in[2] == 0x00) && (in[3] == 0x00))
				1214	return(XML_CHAR_ENCODING_UCS4_3412);
				1215	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
				1216	(in[2] == 0xA7) && (in[3] == 0x94))
				1217	return(XML_CHAR_ENCODING_EBCDIC);
				1218	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
				1219	(in[2] == 0x78) && (in[3] == 0x6D))
				1220	return(XML_CHAR_ENCODING_UTF8);
				1221	}
Daniel Veillard	87a764e	2001-06-20 17:41:10 +0000	[diff] [blame]	1222	if (len >= 3) {
				1223	/*
				1224	* Errata on XML-1.0 June 20 2001
				1225	* We now allow an UTF8 encoded BOM
				1226	*/
				1227	if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
				1228	(in[2] == 0xBF))
				1229	return(XML_CHAR_ENCODING_UTF8);
				1230	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1231	if (len >= 2) {
				1232	if ((in[0] == 0xFE) && (in[1] == 0xFF))
				1233	return(XML_CHAR_ENCODING_UTF16BE);
				1234	if ((in[0] == 0xFF) && (in[1] == 0xFE))
				1235	return(XML_CHAR_ENCODING_UTF16LE);
				1236	}
				1237	return(XML_CHAR_ENCODING_NONE);
				1238	}
				1239
				1240	/**
				1241	* xmlCleanupEncodingAliases:
				1242	*
				1243	* Unregisters all aliases
				1244	*/
				1245	void
				1246	xmlCleanupEncodingAliases(void) {
				1247	int i;
				1248
				1249	if (xmlCharEncodingAliases == NULL)
				1250	return;
				1251
				1252	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1253	if (xmlCharEncodingAliases[i].name != NULL)
				1254	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1255	if (xmlCharEncodingAliases[i].alias != NULL)
				1256	xmlFree((char *) xmlCharEncodingAliases[i].alias);
				1257	}
				1258	xmlCharEncodingAliasesNb = 0;
				1259	xmlCharEncodingAliasesMax = 0;
				1260	xmlFree(xmlCharEncodingAliases);
Daniel Veillard	73c6e53	2002-01-08 13:15:33 +0000	[diff] [blame]	1261	xmlCharEncodingAliases = NULL;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1262	}
				1263
				1264	/**
				1265	* xmlGetEncodingAlias:
				1266	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1267	*
				1268	* Lookup an encoding name for the given alias.
				1269	*
				1270	* Returns NULL if not found the original name otherwise
				1271	*/
				1272	const char *
				1273	xmlGetEncodingAlias(const char *alias) {
				1274	int i;
				1275	char upper[100];
				1276
				1277	if (alias == NULL)
				1278	return(NULL);
				1279
				1280	if (xmlCharEncodingAliases == NULL)
				1281	return(NULL);
				1282
				1283	for (i = 0;i < 99;i++) {
				1284	upper[i] = toupper(alias[i]);
				1285	if (upper[i] == 0) break;
				1286	}
				1287	upper[i] = 0;
				1288
				1289	/*
				1290	* Walk down the list looking for a definition of the alias
				1291	*/
				1292	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1293	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
				1294	return(xmlCharEncodingAliases[i].name);
				1295	}
				1296	}
				1297	return(NULL);
				1298	}
				1299
				1300	/**
				1301	* xmlAddEncodingAlias:
				1302	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
				1303	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1304	*
				1305	* Registers and alias @alias for an encoding named @name. Existing alias
				1306	* will be overwritten.
				1307	*
				1308	* Returns 0 in case of success, -1 in case of error
				1309	*/
				1310	int
				1311	xmlAddEncodingAlias(const char name, const char alias) {
				1312	int i;
				1313	char upper[100];
				1314
				1315	if ((name == NULL) \|\| (alias == NULL))
				1316	return(-1);
				1317
				1318	for (i = 0;i < 99;i++) {
				1319	upper[i] = toupper(alias[i]);
				1320	if (upper[i] == 0) break;
				1321	}
				1322	upper[i] = 0;
				1323
				1324	if (xmlCharEncodingAliases == NULL) {
				1325	xmlCharEncodingAliasesNb = 0;
				1326	xmlCharEncodingAliasesMax = 20;
				1327	xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
				1328	xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
				1329	if (xmlCharEncodingAliases == NULL)
				1330	return(-1);
				1331	} else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
				1332	xmlCharEncodingAliasesMax *= 2;
				1333	xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
				1334	xmlRealloc(xmlCharEncodingAliases,
				1335	xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
				1336	}
				1337	/*
				1338	* Walk down the list looking for a definition of the alias
				1339	*/
				1340	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1341	if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
				1342	/*
				1343	* Replace the definition.
				1344	*/
				1345	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1346	xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
				1347	return(0);
				1348	}
				1349	}
				1350	/*
				1351	* Add the definition
				1352	*/
				1353	xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
				1354	xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
				1355	xmlCharEncodingAliasesNb++;
				1356	return(0);
				1357	}
				1358
				1359	/**
				1360	* xmlDelEncodingAlias:
				1361	* @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
				1362	*
				1363	* Unregisters an encoding alias @alias
				1364	*
				1365	* Returns 0 in case of success, -1 in case of error
				1366	*/
				1367	int
				1368	xmlDelEncodingAlias(const char *alias) {
				1369	int i;
				1370
				1371	if (alias == NULL)
				1372	return(-1);
				1373
				1374	if (xmlCharEncodingAliases == NULL)
				1375	return(-1);
				1376	/*
				1377	* Walk down the list looking for a definition of the alias
				1378	*/
				1379	for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
				1380	if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
				1381	xmlFree((char *) xmlCharEncodingAliases[i].name);
				1382	xmlFree((char *) xmlCharEncodingAliases[i].alias);
				1383	xmlCharEncodingAliasesNb--;
				1384	memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
				1385	sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
				1386	return(0);
				1387	}
				1388	}
				1389	return(-1);
				1390	}
				1391
				1392	/**
				1393	* xmlParseCharEncoding:
				1394	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
				1395	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1396	* Compare the string to the known encoding schemes already known. Note
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1397	* that the comparison is case insensitive accordingly to the section
				1398	* [XML] 4.3.3 Character Encoding in Entities.
				1399	*
				1400	* Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
				1401	* if not recognized.
				1402	*/
				1403	xmlCharEncoding
				1404	xmlParseCharEncoding(const char* name)
				1405	{
				1406	const char *alias;
				1407	char upper[500];
				1408	int i;
				1409
				1410	if (name == NULL)
				1411	return(XML_CHAR_ENCODING_NONE);
				1412
				1413	/*
				1414	* Do the alias resolution
				1415	*/
				1416	alias = xmlGetEncodingAlias(name);
				1417	if (alias != NULL)
				1418	name = alias;
				1419
				1420	for (i = 0;i < 499;i++) {
				1421	upper[i] = toupper(name[i]);
				1422	if (upper[i] == 0) break;
				1423	}
				1424	upper[i] = 0;
				1425
				1426	if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
				1427	if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
				1428	if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
				1429
				1430	/*
				1431	* NOTE: if we were able to parse this, the endianness of UTF16 is
				1432	* already found and in use
				1433	*/
				1434	if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
				1435	if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
				1436
				1437	if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				1438	if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				1439	if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
				1440
				1441	/*
				1442	* NOTE: if we were able to parse this, the endianness of UCS4 is
				1443	* already found and in use
				1444	*/
				1445	if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				1446	if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				1447	if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
				1448
				1449
				1450	if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
				1451	if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
				1452	if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
				1453
				1454	if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
				1455	if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
				1456	if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
				1457
				1458	if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
				1459	if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
				1460	if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
				1461	if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
				1462	if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
				1463	if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
				1464	if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
				1465
				1466	if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
				1467	if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
				1468	if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
				1469
				1470	#ifdef DEBUG_ENCODING
				1471	xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
				1472	#endif
				1473	return(XML_CHAR_ENCODING_ERROR);
				1474	}
				1475
				1476	/**
				1477	* xmlGetCharEncodingName:
				1478	* @enc: the encoding
				1479	*
				1480	* The "canonical" name for XML encoding.
				1481	* C.f. http://www.w3.org/TR/REC-xml#charencoding
				1482	* Section 4.3.3 Character Encoding in Entities
				1483	*
				1484	* Returns the canonical name for the given encoding
				1485	*/
				1486
				1487	const char*
				1488	xmlGetCharEncodingName(xmlCharEncoding enc) {
				1489	switch (enc) {
				1490	case XML_CHAR_ENCODING_ERROR:
				1491	return(NULL);
				1492	case XML_CHAR_ENCODING_NONE:
				1493	return(NULL);
				1494	case XML_CHAR_ENCODING_UTF8:
				1495	return("UTF-8");
				1496	case XML_CHAR_ENCODING_UTF16LE:
				1497	return("UTF-16");
				1498	case XML_CHAR_ENCODING_UTF16BE:
				1499	return("UTF-16");
				1500	case XML_CHAR_ENCODING_EBCDIC:
				1501	return("EBCDIC");
				1502	case XML_CHAR_ENCODING_UCS4LE:
				1503	return("ISO-10646-UCS-4");
				1504	case XML_CHAR_ENCODING_UCS4BE:
				1505	return("ISO-10646-UCS-4");
				1506	case XML_CHAR_ENCODING_UCS4_2143:
				1507	return("ISO-10646-UCS-4");
				1508	case XML_CHAR_ENCODING_UCS4_3412:
				1509	return("ISO-10646-UCS-4");
				1510	case XML_CHAR_ENCODING_UCS2:
				1511	return("ISO-10646-UCS-2");
				1512	case XML_CHAR_ENCODING_8859_1:
				1513	return("ISO-8859-1");
				1514	case XML_CHAR_ENCODING_8859_2:
				1515	return("ISO-8859-2");
				1516	case XML_CHAR_ENCODING_8859_3:
				1517	return("ISO-8859-3");
				1518	case XML_CHAR_ENCODING_8859_4:
				1519	return("ISO-8859-4");
				1520	case XML_CHAR_ENCODING_8859_5:
				1521	return("ISO-8859-5");
				1522	case XML_CHAR_ENCODING_8859_6:
				1523	return("ISO-8859-6");
				1524	case XML_CHAR_ENCODING_8859_7:
				1525	return("ISO-8859-7");
				1526	case XML_CHAR_ENCODING_8859_8:
				1527	return("ISO-8859-8");
				1528	case XML_CHAR_ENCODING_8859_9:
				1529	return("ISO-8859-9");
				1530	case XML_CHAR_ENCODING_2022_JP:
				1531	return("ISO-2022-JP");
				1532	case XML_CHAR_ENCODING_SHIFT_JIS:
				1533	return("Shift-JIS");
				1534	case XML_CHAR_ENCODING_EUC_JP:
				1535	return("EUC-JP");
				1536	case XML_CHAR_ENCODING_ASCII:
				1537	return(NULL);
				1538	}
				1539	return(NULL);
				1540	}
				1541
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1542	/************************************************************************
				1543	* *
				1544	* Char encoding handlers *
				1545	* *
				1546	************************************************************************/
				1547
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1548
				1549	/* Fixed capacity of the handler registry. The size should be growable, but it's not a big deal: once this many handlers are stored, xmlRegisterCharEncodingHandler() refuses further registrations. */
				1550	#define MAX_ENCODING_HANDLERS 50
				1551	static xmlCharEncodingHandlerPtr *handlers = NULL; /* registry array, allocated by xmlInitCharEncodingHandlers() */
				1552	static int nbCharEncodingHandler = 0; /* number of entries of handlers currently in use */
				1553
				1554	/*
				1555	* UTF-8 is the default encoding for XML and is also the encoding used for
				1556	* the parser internals, so the default encoding handler is NULL
				1557	*/
				1558
				1559	static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
				1560
				1561	/**
				1562	* xmlNewCharEncodingHandler:
				1563	* @name: the encoding name, in UTF-8 format (ASCII actually)
				1564	* @input: the xmlCharEncodingInputFunc to read that encoding
				1565	* @output: the xmlCharEncodingOutputFunc to write that encoding
				1566	*
				1567	* Create and registers an xmlCharEncodingHandler.
Daniel Veillard	6f46f6c	2002-08-01 12:22:24 +0000	[diff] [blame]	1568	*
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1569	* Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
				1570	*/
Daniel Veillard	6f46f6c	2002-08-01 12:22:24 +0000	[diff] [blame]	1571	xmlCharEncodingHandlerPtr
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1572	xmlNewCharEncodingHandler(const char *name,
				1573	xmlCharEncodingInputFunc input,
				1574	xmlCharEncodingOutputFunc output) {
				1575	xmlCharEncodingHandlerPtr handler;
				1576	const char *alias;
				1577	char upper[500];
				1578	int i;
				1579	char *up = 0;
				1580
				1581	/*
				1582	* Do the alias resolution
				1583	*/
				1584	alias = xmlGetEncodingAlias(name);
				1585	if (alias != NULL)
				1586	name = alias;
				1587
				1588	/*
				1589	* Keep only the uppercase version of the encoding.
				1590	*/
				1591	if (name == NULL) {
				1592	xmlGenericError(xmlGenericErrorContext,
				1593	"xmlNewCharEncodingHandler : no name !\n");
				1594	return(NULL);
				1595	}
				1596	for (i = 0;i < 499;i++) {
				1597	upper[i] = toupper(name[i]);
				1598	if (upper[i] == 0) break;
				1599	}
				1600	upper[i] = 0;
				1601	up = xmlMemStrdup(upper);
				1602	if (up == NULL) {
				1603	xmlGenericError(xmlGenericErrorContext,
				1604	"xmlNewCharEncodingHandler : out of memory !\n");
				1605	return(NULL);
				1606	}
				1607
				1608	/*
				1609	* allocate and fill-up an handler block.
				1610	*/
				1611	handler = (xmlCharEncodingHandlerPtr)
				1612	xmlMalloc(sizeof(xmlCharEncodingHandler));
				1613	if (handler == NULL) {
				1614	xmlGenericError(xmlGenericErrorContext,
				1615	"xmlNewCharEncodingHandler : out of memory !\n");
				1616	return(NULL);
				1617	}
				1618	handler->input = input;
				1619	handler->output = output;
				1620	handler->name = up;
				1621
				1622	#ifdef LIBXML_ICONV_ENABLED
				1623	handler->iconv_in = NULL;
				1624	handler->iconv_out = NULL;
				1625	#endif /* LIBXML_ICONV_ENABLED */
				1626
				1627	/*
				1628	* registers and returns the handler.
				1629	*/
				1630	xmlRegisterCharEncodingHandler(handler);
				1631	#ifdef DEBUG_ENCODING
				1632	xmlGenericError(xmlGenericErrorContext,
				1633	"Registered encoding handler for %s\n", name);
				1634	#endif
				1635	return(handler);
				1636	}
				1637
				1638	/**
				1639	* xmlInitCharEncodingHandlers:
				1640	*
				1641	* Initialize the char encoding support, it registers the default
				1642	* encoding supported.
				1643	* NOTE: while public, this function usually doesn't need to be called
				1644	* in normal processing.
				1645	*/
				1646	void
				1647	xmlInitCharEncodingHandlers(void) {
				1648	unsigned short int tst = 0x1234;
				1649	unsigned char ptr = (unsigned char ) &tst;
				1650
				1651	if (handlers != NULL) return;
				1652
				1653	handlers = (xmlCharEncodingHandlerPtr *)
				1654	xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
				1655
				1656	if (*ptr == 0x12) xmlLittleEndian = 0;
				1657	else if (*ptr == 0x34) xmlLittleEndian = 1;
				1658	else xmlGenericError(xmlGenericErrorContext,
				1659	"Odd problem at endianness detection\n");
				1660
				1661	if (handlers == NULL) {
				1662	xmlGenericError(xmlGenericErrorContext,
				1663	"xmlInitCharEncodingHandlers : out of memory !\n");
				1664	return;
				1665	}
Daniel Veillard	81601f9	2003-01-14 13:42:37 +0000	[diff] [blame]	1666	xmlNewCharEncodingHandler("UTF-8", UTF8ToUTF8, UTF8ToUTF8);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1667	xmlUTF16LEHandler =
				1668	xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
				1669	xmlUTF16BEHandler =
				1670	xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
				1671	xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
				1672	xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard	2004242	2001-05-31 18:22:04 +0000	[diff] [blame]	1673	xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1674	#ifdef LIBXML_HTML_ENABLED
				1675	xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
				1676	#endif
				1677	}
				1678
				1679	/**
				1680	* xmlCleanupCharEncodingHandlers:
				1681	*
				1682	* Cleanup the memory allocated for the char encoding support, it
				1683	* unregisters all the encoding handlers and the aliases.
				1684	*/
				1685	void
				1686	xmlCleanupCharEncodingHandlers(void) {
				1687	xmlCleanupEncodingAliases();
				1688
				1689	if (handlers == NULL) return;
				1690
				1691	for (;nbCharEncodingHandler > 0;) {
				1692	nbCharEncodingHandler--;
				1693	if (handlers[nbCharEncodingHandler] != NULL) {
				1694	if (handlers[nbCharEncodingHandler]->name != NULL)
				1695	xmlFree(handlers[nbCharEncodingHandler]->name);
				1696	xmlFree(handlers[nbCharEncodingHandler]);
				1697	}
				1698	}
				1699	xmlFree(handlers);
				1700	handlers = NULL;
				1701	nbCharEncodingHandler = 0;
				1702	xmlDefaultCharEncodingHandler = NULL;
				1703	}
				1704
				1705	/**
				1706	* xmlRegisterCharEncodingHandler:
				1707	* @handler: the xmlCharEncodingHandlerPtr handler block
				1708	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1709	* Register the char encoding handler, surprising, isn't it ?
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1710	*/
				1711	void
				1712	xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
				1713	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1714	if (handler == NULL) {
				1715	xmlGenericError(xmlGenericErrorContext,
				1716	"xmlRegisterCharEncodingHandler: NULL handler !\n");
				1717	return;
				1718	}
				1719
				1720	if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
				1721	xmlGenericError(xmlGenericErrorContext,
				1722	"xmlRegisterCharEncodingHandler: Too many handler registered\n");
				1723	xmlGenericError(xmlGenericErrorContext,
				1724	"\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
				1725	return;
				1726	}
				1727	handlers[nbCharEncodingHandler++] = handler;
				1728	}
				1729
				1730	/**
				1731	* xmlGetCharEncodingHandler:
				1732	* @enc: an xmlCharEncoding value.
				1733	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1734	* Search in the registered set the handler able to read/write that encoding.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1735	*
				1736	* Returns the handler or NULL if not found
				1737	*/
				1738	xmlCharEncodingHandlerPtr
				1739	xmlGetCharEncodingHandler(xmlCharEncoding enc) {
				1740	xmlCharEncodingHandlerPtr handler;
				1741
				1742	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1743	switch (enc) {
				1744	case XML_CHAR_ENCODING_ERROR:
				1745	return(NULL);
				1746	case XML_CHAR_ENCODING_NONE:
				1747	return(NULL);
				1748	case XML_CHAR_ENCODING_UTF8:
				1749	return(NULL);
				1750	case XML_CHAR_ENCODING_UTF16LE:
				1751	return(xmlUTF16LEHandler);
				1752	case XML_CHAR_ENCODING_UTF16BE:
				1753	return(xmlUTF16BEHandler);
				1754	case XML_CHAR_ENCODING_EBCDIC:
				1755	handler = xmlFindCharEncodingHandler("EBCDIC");
				1756	if (handler != NULL) return(handler);
				1757	handler = xmlFindCharEncodingHandler("ebcdic");
				1758	if (handler != NULL) return(handler);
				1759	break;
				1760	case XML_CHAR_ENCODING_UCS4BE:
				1761	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1762	if (handler != NULL) return(handler);
				1763	handler = xmlFindCharEncodingHandler("UCS-4");
				1764	if (handler != NULL) return(handler);
				1765	handler = xmlFindCharEncodingHandler("UCS4");
				1766	if (handler != NULL) return(handler);
				1767	break;
				1768	case XML_CHAR_ENCODING_UCS4LE:
				1769	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1770	if (handler != NULL) return(handler);
				1771	handler = xmlFindCharEncodingHandler("UCS-4");
				1772	if (handler != NULL) return(handler);
				1773	handler = xmlFindCharEncodingHandler("UCS4");
				1774	if (handler != NULL) return(handler);
				1775	break;
				1776	case XML_CHAR_ENCODING_UCS4_2143:
				1777	break;
				1778	case XML_CHAR_ENCODING_UCS4_3412:
				1779	break;
				1780	case XML_CHAR_ENCODING_UCS2:
				1781	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
				1782	if (handler != NULL) return(handler);
				1783	handler = xmlFindCharEncodingHandler("UCS-2");
				1784	if (handler != NULL) return(handler);
				1785	handler = xmlFindCharEncodingHandler("UCS2");
				1786	if (handler != NULL) return(handler);
				1787	break;
				1788
				1789	/*
				1790	* We used to keep ISO Latin encodings native in the
				1791	* generated data. This led to so many problems that
				1792	* this has been removed. One can still change this
				1793	* back by registering no-ops encoders for those
				1794	*/
				1795	case XML_CHAR_ENCODING_8859_1:
				1796	handler = xmlFindCharEncodingHandler("ISO-8859-1");
				1797	if (handler != NULL) return(handler);
				1798	break;
				1799	case XML_CHAR_ENCODING_8859_2:
				1800	handler = xmlFindCharEncodingHandler("ISO-8859-2");
				1801	if (handler != NULL) return(handler);
				1802	break;
				1803	case XML_CHAR_ENCODING_8859_3:
				1804	handler = xmlFindCharEncodingHandler("ISO-8859-3");
				1805	if (handler != NULL) return(handler);
				1806	break;
				1807	case XML_CHAR_ENCODING_8859_4:
				1808	handler = xmlFindCharEncodingHandler("ISO-8859-4");
				1809	if (handler != NULL) return(handler);
				1810	break;
				1811	case XML_CHAR_ENCODING_8859_5:
				1812	handler = xmlFindCharEncodingHandler("ISO-8859-5");
				1813	if (handler != NULL) return(handler);
				1814	break;
				1815	case XML_CHAR_ENCODING_8859_6:
				1816	handler = xmlFindCharEncodingHandler("ISO-8859-6");
				1817	if (handler != NULL) return(handler);
				1818	break;
				1819	case XML_CHAR_ENCODING_8859_7:
				1820	handler = xmlFindCharEncodingHandler("ISO-8859-7");
				1821	if (handler != NULL) return(handler);
				1822	break;
				1823	case XML_CHAR_ENCODING_8859_8:
				1824	handler = xmlFindCharEncodingHandler("ISO-8859-8");
				1825	if (handler != NULL) return(handler);
				1826	break;
				1827	case XML_CHAR_ENCODING_8859_9:
				1828	handler = xmlFindCharEncodingHandler("ISO-8859-9");
				1829	if (handler != NULL) return(handler);
				1830	break;
				1831
				1832
				1833	case XML_CHAR_ENCODING_2022_JP:
				1834	handler = xmlFindCharEncodingHandler("ISO-2022-JP");
				1835	if (handler != NULL) return(handler);
				1836	break;
				1837	case XML_CHAR_ENCODING_SHIFT_JIS:
				1838	handler = xmlFindCharEncodingHandler("SHIFT-JIS");
				1839	if (handler != NULL) return(handler);
				1840	handler = xmlFindCharEncodingHandler("SHIFT_JIS");
				1841	if (handler != NULL) return(handler);
				1842	handler = xmlFindCharEncodingHandler("Shift_JIS");
				1843	if (handler != NULL) return(handler);
				1844	break;
				1845	case XML_CHAR_ENCODING_EUC_JP:
				1846	handler = xmlFindCharEncodingHandler("EUC-JP");
				1847	if (handler != NULL) return(handler);
				1848	break;
				1849	default:
				1850	break;
				1851	}
				1852
				1853	#ifdef DEBUG_ENCODING
				1854	xmlGenericError(xmlGenericErrorContext,
				1855	"No handler found for encoding %d\n", enc);
				1856	#endif
				1857	return(NULL);
				1858	}
				1859
				1860	/**
Daniel Veillard	5e2dace	2001-07-18 19:30:27 +0000	[diff] [blame]	1861	* xmlFindCharEncodingHandler:
				1862	* @name: a string describing the char encoding.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1863	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	1864	* Search in the registered set the handler able to read/write that encoding.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	1865	*
				1866	* Returns the handler or NULL if not found
				1867	*/
				1868	xmlCharEncodingHandlerPtr
				1869	xmlFindCharEncodingHandler(const char *name) {
				1870	const char *nalias;
				1871	const char *norig;
				1872	xmlCharEncoding alias;
				1873	#ifdef LIBXML_ICONV_ENABLED
				1874	xmlCharEncodingHandlerPtr enc;
				1875	iconv_t icv_in, icv_out;
				1876	#endif /* LIBXML_ICONV_ENABLED */
				1877	char upper[100];
				1878	int i;
				1879
				1880	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1881	if (name == NULL) return(xmlDefaultCharEncodingHandler);
				1882	if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
				1883
				1884	/*
				1885	* Do the alias resolution
				1886	*/
				1887	norig = name;
				1888	nalias = xmlGetEncodingAlias(name);
				1889	if (nalias != NULL)
				1890	name = nalias;
				1891
				1892	/*
				1893	* Check first for directly registered encoding names
				1894	*/
				1895	for (i = 0;i < 99;i++) {
				1896	upper[i] = toupper(name[i]);
				1897	if (upper[i] == 0) break;
				1898	}
				1899	upper[i] = 0;
				1900
				1901	for (i = 0;i < nbCharEncodingHandler; i++)
				1902	if (!strcmp(upper, handlers[i]->name)) {
				1903	#ifdef DEBUG_ENCODING
				1904	xmlGenericError(xmlGenericErrorContext,
				1905	"Found registered handler for encoding %s\n", name);
				1906	#endif
				1907	return(handlers[i]);
				1908	}
				1909
				1910	#ifdef LIBXML_ICONV_ENABLED
				1911	/* check whether iconv can handle this */
				1912	icv_in = iconv_open("UTF-8", name);
				1913	icv_out = iconv_open(name, "UTF-8");
				1914	if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
				1915	enc = (xmlCharEncodingHandlerPtr)
				1916	xmlMalloc(sizeof(xmlCharEncodingHandler));
				1917	if (enc == NULL) {
				1918	iconv_close(icv_in);
				1919	iconv_close(icv_out);
				1920	return(NULL);
				1921	}
				1922	enc->name = xmlMemStrdup(name);
				1923	enc->input = NULL;
				1924	enc->output = NULL;
				1925	enc->iconv_in = icv_in;
				1926	enc->iconv_out = icv_out;
				1927	#ifdef DEBUG_ENCODING
				1928	xmlGenericError(xmlGenericErrorContext,
				1929	"Found iconv handler for encoding %s\n", name);
				1930	#endif
				1931	return enc;
				1932	} else if ((icv_in != (iconv_t) -1) \|\| icv_out != (iconv_t) -1) {
				1933	xmlGenericError(xmlGenericErrorContext,
				1934	"iconv : problems with filters for '%s'\n", name);
				1935	}
				1936	#endif /* LIBXML_ICONV_ENABLED */
				1937
				1938	#ifdef DEBUG_ENCODING
				1939	xmlGenericError(xmlGenericErrorContext,
				1940	"No handler found for encoding %s\n", name);
				1941	#endif
				1942
				1943	/*
				1944	* Fallback using the canonical names
				1945	*/
				1946	alias = xmlParseCharEncoding(norig);
				1947	if (alias != XML_CHAR_ENCODING_ERROR) {
				1948	const char* canon;
				1949	canon = xmlGetCharEncodingName(alias);
				1950	if ((canon != NULL) && (strcmp(name, canon))) {
				1951	return(xmlFindCharEncodingHandler(canon));
				1952	}
				1953	}
				1954
				1955	return(NULL);
				1956	}
				1957
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	1958	/************************************************************************
				1959	* *
				1960	* ICONV based generic conversion functions *
				1961	* *
				1962	************************************************************************/
				1963
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert, or NULL
 * @inlen:  the length of @in
 *
 * Runs one iconv() call on the given buffers and translates the outcome
 * into the return codes used by the encoding handlers of this file.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (for *in is not valid utf8 string or
 *        the result of transformation can't fit into the encoding we want), or
 *     -3 if there the last byte can't form a single output char.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * When @in is NULL both are set to 0.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t icv_inlen = (size_t) *inlen, icv_outlen = (size_t) *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;     /* iconv() returns size_t, (size_t) -1 on error */

    ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
    if (in != NULL) {
        /* iconv() left the remaining byte counts in the icv_* lengths */
        *inlen -= (int) icv_inlen;
        *outlen -= (int) icv_outlen;
    } else {
        /*
         * NOTE(review): any bytes iconv() may have emitted for this
         * NULL-input call are reported as 0 here -- confirm that this
         * is intended for stateful encodings.
         */
        *inlen = 0;
        *outlen = 0;
    }
    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (errno == EILSEQ)
            return -2;
#endif
#ifdef E2BIG
        if (errno == E2BIG)
            return -1;
#endif
        /* EINVAL (incomplete input sequence) and any other failure */
        return -3;
    }
    return 0;
}
#endif /* LIBXML_ICONV_ENABLED */
				2024
Daniel Veillard	97ac131	2001-05-30 19:14:17 +0000	[diff] [blame]	2025	/************************************************************************
				2026	* *
				2027	* The real API used by libxml for on-the-fly conversion *
				2028	* *
				2029	************************************************************************/
				2030
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2031	/**
				2032	* xmlCharEncFirstLine:
				2033	* @handler: char enconding transformation data structure
				2034	* @out: an xmlBuffer for the output.
				2035	* @in: an xmlBuffer for the input
				2036	*
				2037	* Front-end for the encoding handler input function, but handle only
				2038	* the very first line, i.e. limit itself to 45 chars.
				2039	*
				2040	* Returns the number of byte written if success, or
				2041	* -1 general error
				2042	* -2 if the transcoding fails (for *in is not valid utf8 string or
				2043	* the result of transformation can't fit into the encoding we want), or
				2044	*/
				2045	int
				2046	xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				2047	xmlBufferPtr in) {
				2048	int ret = -2;
				2049	int written;
				2050	int toconv;
				2051
				2052	if (handler == NULL) return(-1);
				2053	if (out == NULL) return(-1);
				2054	if (in == NULL) return(-1);
				2055
				2056	written = out->size - out->use;
				2057	toconv = in->use;
				2058	if (toconv * 2 >= written) {
				2059	xmlBufferGrow(out, toconv);
				2060	written = out->size - out->use - 1;
				2061	}
				2062
				2063	/*
				2064	* echo '<?xml version="1.0" encoding="UCS4"?>' \| wc -c => 38
				2065	* 45 chars should be sufficient to reach the end of the encoding
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2066	* declaration without going too far inside the document content.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2067	*/
				2068	written = 45;
				2069
				2070	if (handler->input != NULL) {
				2071	ret = handler->input(&out->content[out->use], &written,
				2072	in->content, &toconv);
				2073	xmlBufferShrink(in, toconv);
				2074	out->use += written;
				2075	out->content[out->use] = 0;
				2076	}
				2077	#ifdef LIBXML_ICONV_ENABLED
				2078	else if (handler->iconv_in != NULL) {
				2079	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				2080	&written, in->content, &toconv);
				2081	xmlBufferShrink(in, toconv);
				2082	out->use += written;
				2083	out->content[out->use] = 0;
				2084	if (ret == -1) ret = -3;
				2085	}
				2086	#endif /* LIBXML_ICONV_ENABLED */
				2087	#ifdef DEBUG_ENCODING
				2088	switch (ret) {
				2089	case 0:
				2090	xmlGenericError(xmlGenericErrorContext,
				2091	"converted %d bytes to %d bytes of input\n",
				2092	toconv, written);
				2093	break;
				2094	case -1:
				2095	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				2096	toconv, written, in->use);
				2097	break;
				2098	case -2:
				2099	xmlGenericError(xmlGenericErrorContext,
				2100	"input conversion failed due to input error\n");
				2101	break;
				2102	case -3:
				2103	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
				2104	toconv, written, in->use);
				2105	break;
				2106	default:
				2107	xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
				2108	}
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2109	#endif /* DEBUG_ENCODING */
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2110	/*
				2111	* Ignore when input buffer is not on a boundary
				2112	*/
				2113	if (ret == -3) ret = 0;
				2114	if (ret == -1) ret = 0;
				2115	return(ret);
				2116	}
				2117
				2118	/**
				2119	* xmlCharEncInFunc:
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2120	* @handler: char encoding transformation data structure
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2121	* @out: an xmlBuffer for the output.
				2122	* @in: an xmlBuffer for the input
				2123	*
				2124	* Generic front-end for the encoding handler input function
				2125	*
				2126	* Returns the number of byte written if success, or
				2127	* -1 general error
				2128	* -2 if the transcoding fails (for *in is not valid utf8 string or
				2129	* the result of transformation can't fit into the encoding we want), or
				2130	*/
				2131	int
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2132	xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
				2133	xmlBufferPtr in)
				2134	{
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2135	int ret = -2;
				2136	int written;
				2137	int toconv;
				2138
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2139	if (handler == NULL)
				2140	return (-1);
				2141	if (out == NULL)
				2142	return (-1);
				2143	if (in == NULL)
				2144	return (-1);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2145
				2146	toconv = in->use;
				2147	if (toconv == 0)
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2148	return (0);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2149	written = out->size - out->use;
				2150	if (toconv * 2 >= written) {
				2151	xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2152	written = out->size - out->use - 1;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2153	}
				2154	if (handler->input != NULL) {
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2155	ret = handler->input(&out->content[out->use], &written,
				2156	in->content, &toconv);
				2157	xmlBufferShrink(in, toconv);
				2158	out->use += written;
				2159	out->content[out->use] = 0;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2160	}
				2161	#ifdef LIBXML_ICONV_ENABLED
				2162	else if (handler->iconv_in != NULL) {
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2163	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				2164	&written, in->content, &toconv);
				2165	xmlBufferShrink(in, toconv);
				2166	out->use += written;
				2167	out->content[out->use] = 0;
				2168	if (ret == -1)
				2169	ret = -3;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2170	}
				2171	#endif /* LIBXML_ICONV_ENABLED */
				2172	switch (ret) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2173	case 0:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2174	#ifdef DEBUG_ENCODING
				2175	xmlGenericError(xmlGenericErrorContext,
				2176	"converted %d bytes to %d bytes of input\n",
				2177	toconv, written);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2178	#endif
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2179	break;
				2180	case -1:
				2181	#ifdef DEBUG_ENCODING
				2182	xmlGenericError(xmlGenericErrorContext,
				2183	"converted %d bytes to %d bytes of input, %d left\n",
				2184	toconv, written, in->use);
				2185	#endif
				2186	break;
				2187	case -3:
				2188	#ifdef DEBUG_ENCODING
				2189	xmlGenericError(xmlGenericErrorContext,
				2190	"converted %d bytes to %d bytes of input, %d left\n",
				2191	toconv, written, in->use);
				2192	#endif
				2193	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2194	case -2:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2195	xmlGenericError(xmlGenericErrorContext,
				2196	"input conversion failed due to input error\n");
				2197	xmlGenericError(xmlGenericErrorContext,
				2198	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2199	in->content[0], in->content[1],
				2200	in->content[2], in->content[3]);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2201	}
				2202	/*
				2203	* Ignore when input buffer is not on a boundary
				2204	*/
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2205	if (ret == -3)
				2206	ret = 0;
Daniel Veillard	d076a20	2002-11-20 13:28:31 +0000	[diff] [blame]	2207	return (written);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2208	}
				2209
				2210	/**
				2211	* xmlCharEncOutFunc:
				2212	* @handler: char enconding transformation data structure
				2213	* @out: an xmlBuffer for the output.
				2214	* @in: an xmlBuffer for the input
				2215	*
				2216	* Generic front-end for the encoding handler output function
				2217	* a first call with @in == NULL has to be made firs to initiate the
				2218	* output in case of non-stateless encoding needing to initiate their
				2219	* state or the output (like the BOM in UTF16).
				2220	* In case of UTF8 sequence conversion errors for the given encoder,
				2221	* the content will be automatically remapped to a CharRef sequence.
				2222	*
				2223	* Returns the number of byte written if success, or
				2224	* -1 general error
				2225	* -2 if the transcoding fails (for *in is not valid utf8 string or
				2226	* the result of transformation can't fit into the encoding we want), or
				2227	*/
				2228	int
				2229	xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				2230	xmlBufferPtr in) {
				2231	int ret = -2;
				2232	int written;
				2233	int writtentot = 0;
				2234	int toconv;
				2235	int output = 0;
				2236
				2237	if (handler == NULL) return(-1);
				2238	if (out == NULL) return(-1);
				2239
				2240	retry:
				2241
				2242	written = out->size - out->use;
				2243
Igor Zlatkovic	73267db	2003-03-08 13:29:24 +0000	[diff] [blame]	2244	if (written > 0)
				2245	written--; /* Gennady: count '/0' */
				2246
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2247	/*
				2248	* First specific handling of in = NULL, i.e. the initialization call
				2249	*/
				2250	if (in == NULL) {
				2251	toconv = 0;
				2252	if (handler->output != NULL) {
				2253	ret = handler->output(&out->content[out->use], &written,
				2254	NULL, &toconv);
Daniel Veillard	8caa9c2	2003-06-02 13:35:24 +0000	[diff] [blame]	2255	if (ret >= 0) { /* Gennady: check return value */
Igor Zlatkovic	73267db	2003-03-08 13:29:24 +0000	[diff] [blame]	2256	out->use += written;
				2257	out->content[out->use] = 0;
				2258	}
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2259	}
				2260	#ifdef LIBXML_ICONV_ENABLED
				2261	else if (handler->iconv_out != NULL) {
				2262	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				2263	&written, NULL, &toconv);
				2264	out->use += written;
				2265	out->content[out->use] = 0;
				2266	}
				2267	#endif /* LIBXML_ICONV_ENABLED */
				2268	#ifdef DEBUG_ENCODING
				2269	xmlGenericError(xmlGenericErrorContext,
				2270	"initialized encoder\n");
				2271	#endif
				2272	return(0);
				2273	}
				2274
				2275	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2276	* Conversion itself.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2277	*/
				2278	toconv = in->use;
				2279	if (toconv == 0)
				2280	return(0);
				2281	if (toconv * 2 >= written) {
				2282	xmlBufferGrow(out, toconv * 2);
				2283	written = out->size - out->use - 1;
				2284	}
				2285	if (handler->output != NULL) {
				2286	ret = handler->output(&out->content[out->use], &written,
				2287	in->content, &toconv);
				2288	xmlBufferShrink(in, toconv);
				2289	out->use += written;
				2290	writtentot += written;
				2291	out->content[out->use] = 0;
				2292	}
				2293	#ifdef LIBXML_ICONV_ENABLED
				2294	else if (handler->iconv_out != NULL) {
				2295	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				2296	&written, in->content, &toconv);
				2297	xmlBufferShrink(in, toconv);
				2298	out->use += written;
				2299	writtentot += written;
				2300	out->content[out->use] = 0;
				2301	if (ret == -1) {
				2302	if (written > 0) {
				2303	/*
				2304	* Can be a limitation of iconv
				2305	*/
				2306	goto retry;
				2307	}
				2308	ret = -3;
				2309	}
				2310	}
				2311	#endif /* LIBXML_ICONV_ENABLED */
				2312	else {
				2313	xmlGenericError(xmlGenericErrorContext,
				2314	"xmlCharEncOutFunc: no output function !\n");
				2315	return(-1);
				2316	}
				2317
				2318	if (ret >= 0) output += ret;
				2319
				2320	/*
				2321	* Attempt to handle error cases
				2322	*/
				2323	switch (ret) {
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2324	case 0:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2325	#ifdef DEBUG_ENCODING
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2326	xmlGenericError(xmlGenericErrorContext,
				2327	"converted %d bytes to %d bytes of output\n",
				2328	toconv, written);
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2329	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2330	break;
				2331	case -1:
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2332	#ifdef DEBUG_ENCODING
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2333	xmlGenericError(xmlGenericErrorContext,
				2334	"output conversion failed by lack of space\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2335	#endif
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2336	break;
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2337	case -3:
Daniel Veillard	809faa5	2003-02-10 15:43:53 +0000	[diff] [blame]	2338	#ifdef DEBUG_ENCODING
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2339	xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
				2340	toconv, written, in->use);
Daniel Veillard	809faa5	2003-02-10 15:43:53 +0000	[diff] [blame]	2341	#endif
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2342	break;
				2343	case -2: {
				2344	int len = in->use;
				2345	const xmlChar utf = (const xmlChar ) in->content;
				2346	int cur;
				2347
				2348	cur = xmlGetUTF8Char(utf, &len);
				2349	if (cur > 0) {
				2350	xmlChar charref[20];
				2351
				2352	#ifdef DEBUG_ENCODING
				2353	xmlGenericError(xmlGenericErrorContext,
				2354	"handling output conversion error\n");
				2355	xmlGenericError(xmlGenericErrorContext,
				2356	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2357	in->content[0], in->content[1],
				2358	in->content[2], in->content[3]);
				2359	#endif
				2360	/*
				2361	* Removes the UTF8 sequence, and replace it by a charref
				2362	* and continue the transcoding phase, hoping the error
				2363	* did not mangle the encoder state.
				2364	*/
Aleksey Sanin	49cc975	2002-06-14 17:07:10 +0000	[diff] [blame]	2365	snprintf((char *) charref, sizeof(charref), "&#%d;", cur);
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2366	xmlBufferShrink(in, len);
				2367	xmlBufferAddHead(in, charref, -1);
				2368
				2369	goto retry;
				2370	} else {
				2371	xmlGenericError(xmlGenericErrorContext,
				2372	"output conversion failed due to conv error\n");
				2373	xmlGenericError(xmlGenericErrorContext,
				2374	"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				2375	in->content[0], in->content[1],
				2376	in->content[2], in->content[3]);
				2377	in->content[0] = ' ';
				2378	}
				2379	break;
				2380	}
				2381	}
				2382	return(ret);
				2383	}
				2384
				2385	/**
				2386	* xmlCharEncCloseFunc:
				2387	* @handler: char enconding transformation data structure
				2388	*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2389	* Generic front-end for encoding handler close function
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2390	*
				2391	* Returns 0 if success, or -1 in case of error
				2392	*/
				2393	int
				2394	xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
				2395	int ret = 0;
				2396	if (handler == NULL) return(-1);
				2397	if (handler->name == NULL) return(-1);
				2398	#ifdef LIBXML_ICONV_ENABLED
				2399	/*
Daniel Veillard	cbaf399	2001-12-31 16:16:02 +0000	[diff] [blame]	2400	* Iconv handlers can be used only once, free the whole block.
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2401	* and the associated icon resources.
				2402	*/
				2403	if ((handler->iconv_out != NULL) \|\| (handler->iconv_in != NULL)) {
				2404	if (handler->name != NULL)
				2405	xmlFree(handler->name);
				2406	handler->name = NULL;
				2407	if (handler->iconv_out != NULL) {
				2408	if (iconv_close(handler->iconv_out))
				2409	ret = -1;
				2410	handler->iconv_out = NULL;
				2411	}
				2412	if (handler->iconv_in != NULL) {
				2413	if (iconv_close(handler->iconv_in))
				2414	ret = -1;
				2415	handler->iconv_in = NULL;
				2416	}
				2417	xmlFree(handler);
				2418	}
				2419	#endif /* LIBXML_ICONV_ENABLED */
				2420	#ifdef DEBUG_ENCODING
				2421	if (ret)
				2422	xmlGenericError(xmlGenericErrorContext,
				2423	"failed to close the encoding handler\n");
				2424	else
				2425	xmlGenericError(xmlGenericErrorContext,
				2426	"closed the encoding handler\n");
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2427	#endif
Daniel Veillard	d79bcd1	2001-06-21 22:07:42 +0000	[diff] [blame]	2428
Owen Taylor	3473f88	2001-02-23 17:55:21 +0000	[diff] [blame]	2429	return(ret);
				2430	}
				2431