Blame - encoding.c - platform/external/libxml2

blob: 3031ce8c8b99c412b1f7fa7af1b3f4cbb151c837 [file] [log] [blame]

Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	1	/*
				2	* encoding.c : implements the encoding conversion functions needed for XML
				3	*
				4	* Related specs:
				5	* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	6	* rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	7	* [ISO-10646] UTF-8 and UTF-16 in Annexes
				8	* [ISO-8859-1] ISO Latin-1 characters codes.
				9	* [UNICODE] The Unicode Consortium, "The Unicode Standard --
				10	* Worldwide Character Encoding -- Version 1.0", Addison-
				11	* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
				12	* described in Unicode Technical Report #4.
				13	* [US-ASCII] Coded Character Set--7-bit American Standard Code for
				14	* Information Interchange, ANSI X3.4-1986.
				15	*
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	16	* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	17	*
				18	* See Copyright for the status of this software.
				19	*
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	20	* Daniel.Veillard@w3.org
				21	*/
				22
Daniel Veillard	3c558c3	1999-12-22 11:30:41 +0000	[diff] [blame]	23	#ifdef WIN32
				24	#include "win32config.h"
				25	#else
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	26	#include "config.h"
Daniel Veillard	7f7d111	1999-09-22 09:46:25 +0000	[diff] [blame]	27	#endif
				28
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	29	#include <stdio.h>
Daniel Veillard	7f7d111	1999-09-22 09:46:25 +0000	[diff] [blame]	30	#include <string.h>
				31
				32	#ifdef HAVE_CTYPE_H
				33	#include <ctype.h>
				34	#endif
Daniel Veillard	6d3bf1f	1999-12-16 17:52:19 +0000	[diff] [blame]	35	#ifdef HAVE_STDLIB_H
				36	#include <stdlib.h>
				37	#endif
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	38	#include <libxml/xmlversion.h>
				39	#ifdef LIBXML_ICONV_ENABLED
				40	#ifdef HAVE_ERRNO_H
				41	#include <errno.h>
				42	#endif
				43	#endif
Daniel Veillard	361d845	2000-04-03 19:48:13 +0000	[diff] [blame]	44	#include <libxml/encoding.h>
				45	#include <libxml/xmlmemory.h>
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	46	#ifdef LIBXML_HTML_ENABLED
				47	#include <libxml/HTMLparser.h>
				48	#endif
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	49
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	50	xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
				51	xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Daniel Veillard	b05deb7	1999-08-10 19:04:08 +0000	[diff] [blame]	52
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	53	#ifdef LIBXML_ICONV_ENABLED
				54	#if 0
				55	#define DEBUG_ENCODING /* Define this to get encoding traces */
				56	#endif
				57	#endif
				58
				59	static int xmlLittleEndian = 1;
				60
Daniel Veillard	0ba4d53	1998-11-01 19:34:31 +0000	[diff] [blame]	61	/*
				62	* From rfc2044: encoding of the Unicode values on UTF-8:
				63	*
				64	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				65	* 0000 0000-0000 007F 0xxxxxxx
				66	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				67	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				68	*
				69	* I hope we won't use values > 0xFFFF anytime soon !
				70	*/
				71
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  on input, the number of bytes available at @utf; on output, the
 *        number of bytes used by the decoded character (0 on error)
 *
 * Read one UTF-8 encoded character from @utf. Sequences of 1 to 4 bytes
 * are recognised. Overlong encodings are not rejected, and the decoded
 * value is not checked against the 0x10FFFF upper bound.
 *
 * Returns the character value, or -1 in case of error (NULL argument,
 * truncated sequence, or malformed lead/trailing byte).
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    /* Without @len there is no input size and nowhere to report usage. */
    if (len == NULL)
        return(-1);
    if ((utf == NULL) || (*len < 1))
        goto error;

    c = utf[0];
    if ((c & 0x80) == 0) {
        /* 1-byte code */
        *len = 1;
        return((int) c);
    }

    /* A continuation byte (10xxxxxx) can never start a sequence. */
    if ((c & 0xc0) == 0x80)
        goto error;
    if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
        goto error;
    if ((c & 0xe0) != 0xe0) {
        /* 2-byte code */
        *len = 2;
        c = (utf[0] & 0x1f) << 6;
        c |= utf[1] & 0x3f;
        return((int) c);
    }

    if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
        goto error;
    if ((c & 0xf0) != 0xf0) {
        /* 3-byte code */
        *len = 3;
        c = (utf[0] & 0xf) << 12;
        c |= (utf[1] & 0x3f) << 6;
        c |= utf[2] & 0x3f;
        return((int) c);
    }

    if (*len < 4)
        goto error;
    /* Lead bytes 0xF8 and above would announce 5+ byte sequences. */
    if (((c & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto error;
    /* 4-byte code */
    *len = 4;
    c = (utf[0] & 0x7) << 18;
    c |= (utf[1] & 0x3f) << 12;
    c |= (utf[2] & 0x3f) << 6;
    c |= utf[3] & 0x3f;
    return((int) c);

error:
    *len = 0;
    return(-1);
}
				138
				139	/**
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	140	* xmlCheckUTF8: Check utf-8 string for legality.
				141	* @utf: Pointer to putative utf-8 encoded string.
				142	*
				143	* Checks @utf for being valid utf-8. @utf is assumed to be
				144	* null-terminated. This function is not super-strict, as it will
				145	* allow longer utf-8 sequences than necessary. Note that Java is
				146	* capable of producing these sequences if provoked. Also note, this
				147	* routine checks for the 4-byte maxiumum size, but does not check for
				148	* 0x10ffff maximum value.
				149	*
				150	* Return value: true if @utf is valid.
				151	**/
				152	int
				153	xmlCheckUTF8(const unsigned char *utf)
				154	{
				155	int ix;
				156	unsigned char c;
				157
				158	for (ix = 0; (c = utf[ix]);) {
				159	if (c & 0x80) {
				160	if ((utf[ix + 1] & 0xc0) != 0x80)
				161	return(0);
				162	if ((c & 0xe0) == 0xe0) {
				163	if ((utf[ix + 2] & 0xc0) != 0x80)
				164	return(0);
				165	if ((c & 0xf0) == 0xf0) {
				166	if ((c & 0xf8) != 0xf0 \|\| (utf[ix + 3] & 0xc0) != 0x80)
				167	return(0);
				168	ix += 4;
				169	/* 4-byte code */
				170	} else
				171	/* 3-byte code */
				172	ix += 3;
				173	} else
				174	/* 2-byte code */
				175	ix += 2;
				176	} else
				177	/* 1-byte code */
				178	ix++;
				179	}
				180	return(1);
				181	}
				182
				183	/**
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	184	* asciiToUTF8:
				185	* @out: a pointer to an array of bytes to store the result
				186	* @outlen: the length of @out
				187	* @in: a pointer to an array of ASCII chars
				188	* @inlen: the length of @in
				189	*
				190	* Take a block of ASCII chars in and try to convert it to an UTF-8
				191	* block of chars out.
				192	* Returns 0 if success, or -1 otherwise
				193	* The value of @inlen after return is the number of octets consumed
				194	* as the return value is positive, else unpredictiable.
				195	* The value of @outlen after return is the number of ocetes consumed.
				196	*/
				197	int
				198	asciiToUTF8(unsigned char* out, int *outlen,
				199	const unsigned char* in, int *inlen) {
				200	unsigned char* outstart = out;
				201	const unsigned char* base = in;
				202	const unsigned char* processed = in;
				203	unsigned char* outend = out + *outlen;
				204	const unsigned char* inend;
				205	unsigned int c;
				206	int bits;
				207
				208	inend = in + (*inlen);
				209	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				210	c= *in++;
				211
				212	/* assertion: c is a single UTF-4 value */
				213	if (out >= outend)
				214	break;
				215	if (c < 0x80) { *out++= c; bits= -6; }
				216	else {
				217	*outlen = out - outstart;
				218	*inlen = processed - base;
				219	return(-1);
				220	}
				221
				222	for ( ; bits >= 0; bits-= 6) {
				223	if (out >= outend)
				224	break;
				225	*out++= ((c >> bits) & 0x3F) \| 0x80;
				226	}
				227	processed = (const unsigned char*) in;
				228	}
				229	*outlen = out - outstart;
				230	*inlen = processed - base;
				231	return(0);
				232	}
				233
				234	/**
				235	* UTF8Toascii:
				236	* @out: a pointer to an array of bytes to store the result
				237	* @outlen: the length of @out
				238	* @in: a pointer to an array of UTF-8 chars
				239	* @inlen: the length of @in
				240	*
				241	* Take a block of UTF-8 chars in and try to convert it to an ASCII
				242	* block of chars out.
				243	*
				244	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
				245	* The value of @inlen after return is the number of octets consumed
				246	* as the return value is positive, else unpredictiable.
				247	* The value of @outlen after return is the number of ocetes consumed.
				248	*/
				249	int
				250	UTF8Toascii(unsigned char* out, int *outlen,
				251	const unsigned char* in, int *inlen) {
				252	const unsigned char* processed = in;
				253	const unsigned char* outend;
				254	const unsigned char* outstart = out;
				255	const unsigned char* instart = in;
				256	const unsigned char* inend;
				257	unsigned int c, d;
				258	int trailing;
				259
				260	if (in == NULL) {
				261	/*
				262	* initialization nothing to do
				263	*/
				264	*outlen = 0;
				265	*inlen = 0;
				266	return(0);
				267	}
				268	inend = in + (*inlen);
				269	outend = out + (*outlen);
				270	while (in < inend) {
				271	d = *in++;
				272	if (d < 0x80) { c= d; trailing= 0; }
				273	else if (d < 0xC0) {
				274	/* trailing byte in leading position */
				275	*outlen = out - outstart;
				276	*inlen = processed - instart;
				277	return(-2);
				278	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				279	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				280	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				281	else {
				282	/* no chance for this in Ascii */
				283	*outlen = out - outstart;
				284	*inlen = processed - instart;
				285	return(-2);
				286	}
				287
				288	if (inend - in < trailing) {
				289	break;
				290	}
				291
				292	for ( ; trailing; trailing--) {
				293	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
				294	break;
				295	c <<= 6;
				296	c \|= d & 0x3F;
				297	}
				298
				299	/* assertion: c is a single UTF-4 value */
				300	if (c < 0x80) {
				301	if (out >= outend)
				302	break;
				303	*out++ = c;
				304	} else {
				305	/* no chance for this in Ascii */
				306	*outlen = out - outstart;
				307	*inlen = processed - instart;
				308	return(-2);
				309	}
				310	processed = in;
				311	}
				312	*outlen = out - outstart;
				313	*inlen = processed - instart;
				314	return(0);
				315	}
				316
				317	/**
Daniel Veillard	97b5877	1998-10-20 06:14:16 +0000	[diff] [blame]	318	* isolat1ToUTF8:
Daniel Veillard	7f85850	1999-11-17 17:32:38 +0000	[diff] [blame]	319	* @out: a pointer to an array of bytes to store the result
				320	* @outlen: the length of @out
				321	* @in: a pointer to an array of ISO Latin 1 chars
				322	* @inlen: the length of @in
Daniel Veillard	97b5877	1998-10-20 06:14:16 +0000	[diff] [blame]	323	*
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	324	* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
				325	* block of chars out.
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	326	* Returns 0 if success, or -1 otherwise
				327	* The value of @inlen after return is the number of octets consumed
				328	* as the return value is positive, else unpredictiable.
				329	* The value of @outlen after return is the number of ocetes consumed.
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	330	*/
Daniel Veillard	97b5877	1998-10-20 06:14:16 +0000	[diff] [blame]	331	int
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	332	isolat1ToUTF8(unsigned char* out, int *outlen,
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	333	const unsigned char* in, int *inlen) {
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	334	unsigned char* outstart = out;
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	335	const unsigned char* base = in;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	336	const unsigned char* processed = in;
				337	unsigned char* outend = out + *outlen;
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	338	const unsigned char* inend;
				339	unsigned int c;
				340	int bits;
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	341
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	342	inend = in + (*inlen);
				343	while ((in < inend) && (out - outstart + 5 < *outlen)) {
				344	c= *in++;
				345
				346	/* assertion: c is a single UTF-4 value */
				347	if (out >= outend)
				348	break;
				349	if (c < 0x80) { *out++= c; bits= -6; }
				350	else { *out++= ((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
				351
				352	for ( ; bits >= 0; bits-= 6) {
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	353	if (out >= outend)
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	354	break;
				355	*out++= ((c >> bits) & 0x3F) \| 0x80;
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	356	}
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	357	processed = (const unsigned char*) in;
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	358	}
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	359	*outlen = out - outstart;
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	360	*inlen = processed - base;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	361	return(0);
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	362	}
				363
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Calling with @in == NULL is an initialization
 * call and produces nothing.
 *
 * Conversion stops early, still returning 0, when @out is full or when
 * the input ends in the middle of a multi-byte sequence. Overlong
 * encodings are not rejected.
 *
 * Returns 0 if success, or -2 if the transcoding fails (malformed UTF-8
 * or a character above 0xFF).
 * The value of @inlen after return is the number of octets consumed,
 * counting only completely converted characters.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* outend;
    const unsigned char* inend;
    unsigned int c = 0, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            goto done;
        } else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in IsoLat1 */
            ret = -2;
            goto done;
        }

        /* incomplete sequence at the end of the input: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* malformed trailing byte */
                ret = -2;
                goto done;
            }
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c > 0xFF) {
            /* no chance for this in IsoLat1 */
            ret = -2;
            goto done;
        }
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }

done:
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(ret);
}
				451
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE chars passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read byte by byte, low byte
 * first, so the result does not depend on the byte order or alignment
 * requirements of the host.
 *
 * A trailing odd byte is ignored. Conversion stops early, still
 * returning 0, when fewer than 6 bytes of @out remain free or when the
 * input ends between the two halves of a surrogate pair.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate is not followed by a low surrogate).
 * The value of @inlenb after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + (*inlenb / 2) * 2;

    /*
     * The 5-byte slack guarantees room for the longest (4-byte) UTF-8
     * sequence, so no further output bounds checks are needed below.
     */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* second half not available yet */
                break;
            }
            d = in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (processed - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80) {
            *out++ = (unsigned char) c;
            bits = -6;
        } else if (c < 0x800) {
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            bits = 0;
        } else if (c < 0x10000) {
            *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
            bits = 6;
        } else {
            *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);
            bits = 12;
        }

        /* emit the continuation bytes, most significant group first */
        for ( ; bits >= 0; bits -= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (processed - inb);
    return(0);
}
				539
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written byte by byte, low byte
 * first, so the result does not depend on the byte order or alignment
 * requirements of the host.
 *
 * Calling with @in == NULL is an initialization call: the Byte Order
 * Mark FF FE is written if @outb has room for it.
 *
 * Conversion stops early, still returning 0, when @outb is full or when
 * the input ends in the middle of a multi-byte sequence.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written by an
 * initialization call, or -2 if the transcoding fails (malformed UTF-8
 * or a value that cannot be represented in UTF-16).
 * The value of @inlen after return is the number of octets consumed,
 * counting only completely converted characters.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c = 0, d;
    unsigned int hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            goto done;
        } else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            goto done;
        }

        /* incomplete sequence at the end of the input: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* malformed sequence: do not emit a partial value */
                ret = -2;
                goto done;
            }
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) ((c >> 8) & 0xFF);
        } else if (c < 0x110000) {
            /* a surrogate pair needs two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
        } else {
            /* beyond the range UTF-16 can encode */
            ret = -2;
            goto done;
        }
        processed = in;
    }

done:
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(ret);
}
				657
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of big-endian UTF-16 code units in and try to convert it
 * to an UTF-8 block of chars out. The input is read one byte at a time,
 * so the result depends neither on the endianness of the host nor on the
 * alignment of @inb.
 *
 * An odd trailing input byte is ignored (*inlenb is rounded down first).
 * Conversion stops early, still returning 0, when the next character does
 * not fit completely in the remaining output space: no truncated UTF-8
 * sequence is ever written.
 *
 * NOTE: a low surrogate (0xDC00-0xDFFF) which is not preceded by a high
 * surrogate is not rejected, it is encoded as a three bytes sequence.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 *     is the last code unit or is not followed by a low surrogate).
 * The value of *outlen after return is the number of octets written.
 * The value of *inlenb after return is the number of octets consumed,
 * i.e. the octets of the characters fully converted to @out.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    const unsigned char* processed = inb;
    unsigned int c, d;
    int bits, needed;

    /* a lone trailing byte cannot form a 16 bits code unit */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        /* big-endian: most significant byte comes first */
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;

        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* high surrogate without its second half */
                *outlen = (int) (out - outstart);
                *inlenb = (int) (processed - inb);
                return(-2);
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (processed - inb);
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80)
            needed = 1;
        else if (c < 0x800)
            needed = 2;
        else if (c < 0x10000)
            needed = 3;
        else
            needed = 4;

        /*
         * Either the whole sequence fits or nothing is written; the
         * character is then left unconsumed for the next call.
         */
        if (outend - out < needed)
            break;

        if (needed == 1) {
            *out++ = (unsigned char) c;
            bits = -6;
        } else if (needed == 2) {
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            bits = 0;
        } else if (needed == 3) {
            *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
            bits = 6;
        } else {
            *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);
            bits = 12;
        }

        for ( ; bits >= 0; bits -= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (processed - inb);
    return(0);
}
				749
				750	/**
				751	* UTF8ToUTF16BE:
				752	* @outb: a pointer to an array of bytes to store the result
				753	* @outlen: the length of @outb
				754	* @in: a pointer to an array of UTF-8 chars
				755	* @inlen: the length of @in
				756	*
				757	* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
				758	* block of chars out.
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	759	*
				760	* Returns the number of byte written, or -1 by lack of space, or -2
				761	* if the transcoding failed.
				762	*/
				763	int
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	764	UTF8ToUTF16BE(unsigned char* outb, int *outlen,
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	765	const unsigned char* in, int *inlen)
				766	{
				767	unsigned short* out = (unsigned short*) outb;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	768	const unsigned char* processed = in;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	769	unsigned short* outstart= out;
				770	unsigned short* outend;
				771	const unsigned char* inend= in+*inlen;
Daniel Veillard	3f6f7f6	2000-06-30 17:58:25 +0000	[diff] [blame]	772	unsigned int c, d;
				773	int trailing;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	774	unsigned char *tmp;
				775	unsigned short tmp1, tmp2;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	776
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	777	if (in == NULL) {
				778	/*
				779	* initialization, add the Byte Order Mark
				780	*/
				781	if (*outlen >= 2) {
				782	outb[0] = 0xFE;
				783	outb[1] = 0xFF;
				784	*outlen = 2;
				785	*inlen = 0;
				786	#ifdef DEBUG_ENCODING
				787	fprintf(stderr, "Added FEFF Byte Order Mark\n");
				788	#endif
				789	return(2);
				790	}
				791	*outlen = 0;
				792	*inlen = 0;
				793	return(0);
				794	}
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	795	outend = out + (*outlen / 2);
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	796	while (in < inend) {
				797	d= *in++;
				798	if (d < 0x80) { c= d; trailing= 0; }
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	799	else if (d < 0xC0) {
				800	/* trailing byte in leading position */
				801	*outlen = out - outstart;
				802	*inlen = processed - in;
				803	return(-2);
				804	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	805	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				806	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	807	else {
				808	/* no chance for this in UTF-16 */
				809	*outlen = out - outstart;
				810	*inlen = processed - in;
				811	return(-2);
				812	}
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	813
				814	if (inend - in < trailing) {
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	815	break;
				816	}
				817
				818	for ( ; trailing; trailing--) {
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	819	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80)) break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	820	c <<= 6;
				821	c \|= d & 0x3F;
				822	}
				823
				824	/* assertion: c is a single UTF-4 value */
				825	if (c < 0x10000) {
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	826	if (out >= outend) break;
				827	if (xmlLittleEndian) {
				828	tmp = (unsigned char *) out;
				829	*tmp = c >> 8;
				830	*(tmp + 1) = c;
				831	out++;
				832	} else {
				833	*out++ = c;
				834	}
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	835	}
				836	else if (c < 0x110000) {
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	837	if (out+1 >= outend) break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	838	c -= 0x10000;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	839	if (xmlLittleEndian) {
				840	tmp1 = 0xD800 \| (c >> 10);
				841	tmp = (unsigned char *) out;
				842	*tmp = tmp1 >> 8;
Daniel Veillard	3f6f7f6	2000-06-30 17:58:25 +0000	[diff] [blame]	843	*(tmp + 1) = (unsigned char) tmp1;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	844	out++;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	845
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	846	tmp2 = 0xDC00 \| (c & 0x03FF);
				847	tmp = (unsigned char *) out;
				848	*tmp = tmp2 >> 8;
Daniel Veillard	3f6f7f6	2000-06-30 17:58:25 +0000	[diff] [blame]	849	*(tmp + 1) = (unsigned char) tmp2;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	850	out++;
				851	} else {
				852	*out++ = 0xD800 \| (c >> 10);
				853	*out++ = 0xDC00 \| (c & 0x03FF);
				854	}
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	855	}
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	856	else
				857	break;
				858	processed = in;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	859	}
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	860	outlen = (out - outstart) 2;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	861	*inlen = processed - in;
				862	return(0);
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	863	}
Daniel Veillard	97b5877	1998-10-20 06:14:16 +0000	[diff] [blame]	864
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	865	/**
				866	* xmlDetectCharEncoding:
				867	* @in: a pointer to the first bytes of the XML entity, must be at least
				868	* 4 bytes long.
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	869	* @len: pointer to the length of the buffer
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	870	*
				871	* Guess the encoding of the entity using the first bytes of the entity content
				872	* accordingly of the non-normative appendix F of the XML-1.0 recommendation.
				873	*
				874	* Returns one of the XML_CHAR_ENCODING_... values.
				875	*/
				876	xmlCharEncoding
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	877	xmlDetectCharEncoding(const unsigned char* in, int len)
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	878	{
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	879	if (len >= 4) {
				880	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				881	(in[2] == 0x00) && (in[3] == 0x3C))
				882	return(XML_CHAR_ENCODING_UCS4BE);
				883	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
				884	(in[2] == 0x00) && (in[3] == 0x00))
				885	return(XML_CHAR_ENCODING_UCS4LE);
				886	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				887	(in[2] == 0x3C) && (in[3] == 0x00))
				888	return(XML_CHAR_ENCODING_UCS4_2143);
				889	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
				890	(in[2] == 0x00) && (in[3] == 0x00))
				891	return(XML_CHAR_ENCODING_UCS4_3412);
				892	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
				893	(in[2] == 0xA7) && (in[3] == 0x94))
				894	return(XML_CHAR_ENCODING_EBCDIC);
				895	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
				896	(in[2] == 0x78) && (in[3] == 0x6D))
				897	return(XML_CHAR_ENCODING_UTF8);
				898	}
				899	if (len >= 2) {
				900	if ((in[0] == 0xFE) && (in[1] == 0xFF))
				901	return(XML_CHAR_ENCODING_UTF16BE);
				902	if ((in[0] == 0xFF) && (in[1] == 0xFE))
				903	return(XML_CHAR_ENCODING_UTF16LE);
				904	}
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	905	return(XML_CHAR_ENCODING_NONE);
				906	}
				907
				908	/**
				909	* xmlParseCharEncoding:
Daniel Veillard	7f85850	1999-11-17 17:32:38 +0000	[diff] [blame]	910	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	911	*
				912	* Conpare the string to the known encoding schemes already known. Note
				913	* that the comparison is case insensitive accordingly to the section
				914	* [XML] 4.3.3 Character Encoding in Entities.
				915	*
				916	* Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
				917	* if not recognized.
				918	*/
				919	xmlCharEncoding
Daniel Veillard	011b63c	1999-06-02 17:44:04 +0000	[diff] [blame]	920	xmlParseCharEncoding(const char* name)
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	921	{
				922	char upper[500];
				923	int i;
				924
				925	for (i = 0;i < 499;i++) {
				926	upper[i] = toupper(name[i]);
				927	if (upper[i] == 0) break;
				928	}
				929	upper[i] = 0;
				930
				931	if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
				932	if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
				933	if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
				934
				935	/*
				936	* NOTE: if we were able to parse this, the endianness of UTF16 is
				937	* already found and in use
				938	*/
				939	if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
				940	if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
				941
				942	if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				943	if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				944	if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
				945
				946	/*
				947	* NOTE: if we were able to parse this, the endianness of UCS4 is
				948	* already found and in use
				949	*/
				950	if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				951	if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				952	if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
				953
				954
				955	if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
				956	if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
				957	if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
				958
				959	if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
				960	if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
				961	if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
				962
				963	if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
				964	if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
				965	if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
				966	if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
				967	if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
				968	if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
				969	if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
				970
				971	if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	972	if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	973	if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	974
				975	#ifdef DEBUG_ENCODING
				976	fprintf(stderr, "Unknown encoding %s\n", name);
				977	#endif
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	978	return(XML_CHAR_ENCODING_ERROR);
				979	}
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	980
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	981	/**
				982	* xmlGetCharEncodingName:
				983	* @enc: the encoding
				984	*
				985	* The "canonical" name for XML encoding.
				986	* C.f. http://www.w3.org/TR/REC-xml#charencoding
				987	* Section 4.3.3 Character Encoding in Entities
				988	*
				989	* Returns the canonical name for the given encoding
				990	*/
				991
				992	const char*
				993	xmlGetCharEncodingName(xmlCharEncoding enc) {
				994	switch (enc) {
				995	case XML_CHAR_ENCODING_ERROR:
				996	return(NULL);
				997	case XML_CHAR_ENCODING_NONE:
				998	return(NULL);
				999	case XML_CHAR_ENCODING_UTF8:
				1000	return("UTF-8");
				1001	case XML_CHAR_ENCODING_UTF16LE:
				1002	return("UTF-16");
				1003	case XML_CHAR_ENCODING_UTF16BE:
				1004	return("UTF-16");
				1005	case XML_CHAR_ENCODING_EBCDIC:
				1006	return("EBCDIC");
				1007	case XML_CHAR_ENCODING_UCS4LE:
				1008	return("ISO-10646-UCS-4");
				1009	case XML_CHAR_ENCODING_UCS4BE:
				1010	return("ISO-10646-UCS-4");
				1011	case XML_CHAR_ENCODING_UCS4_2143:
				1012	return("ISO-10646-UCS-4");
				1013	case XML_CHAR_ENCODING_UCS4_3412:
				1014	return("ISO-10646-UCS-4");
				1015	case XML_CHAR_ENCODING_UCS2:
				1016	return("ISO-10646-UCS-2");
				1017	case XML_CHAR_ENCODING_8859_1:
				1018	return("ISO-8859-1");
				1019	case XML_CHAR_ENCODING_8859_2:
				1020	return("ISO-8859-2");
				1021	case XML_CHAR_ENCODING_8859_3:
				1022	return("ISO-8859-3");
				1023	case XML_CHAR_ENCODING_8859_4:
				1024	return("ISO-8859-4");
				1025	case XML_CHAR_ENCODING_8859_5:
				1026	return("ISO-8859-5");
				1027	case XML_CHAR_ENCODING_8859_6:
				1028	return("ISO-8859-6");
				1029	case XML_CHAR_ENCODING_8859_7:
				1030	return("ISO-8859-7");
				1031	case XML_CHAR_ENCODING_8859_8:
				1032	return("ISO-8859-8");
				1033	case XML_CHAR_ENCODING_8859_9:
				1034	return("ISO-8859-9");
				1035	case XML_CHAR_ENCODING_2022_JP:
				1036	return("ISO-2022-JP");
				1037	case XML_CHAR_ENCODING_SHIFT_JIS:
				1038	return("Shift-JIS");
				1039	case XML_CHAR_ENCODING_EUC_JP:
				1040	return("EUC-JP");
Daniel Veillard	87b9539	2000-08-12 21:12:04 +0000	[diff] [blame^]	1041	case XML_CHAR_ENCODING_ASCII:
				1042	return(NULL);
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1043	}
				1044	return(NULL);
				1045	}
				1046
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1047	/****************************************************************
				1048	* *
				1049	* Char encoding handlers *
				1050	* *
				1051	****************************************************************/
				1052
				1053	/* the size should be growable, but it's not a big deal ... */
				1054	#define MAX_ENCODING_HANDLERS 50
				1055	static xmlCharEncodingHandlerPtr *handlers = NULL;
				1056	static int nbCharEncodingHandler = 0;
				1057
				1058	/*
				1059	* The default is UTF-8 for XML, that's also the default used for the
				1060	* parser internals, so the default encoding handler is NULL
				1061	*/
				1062
				1063	static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
				1064
				1065	/**
				1066	* xmlNewCharEncodingHandler:
Daniel Veillard	7f85850	1999-11-17 17:32:38 +0000	[diff] [blame]	1067	* @name: the encoding name, in UTF-8 format (ASCII actually)
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1068	* @input: the xmlCharEncodingInputFunc to read that encoding
				1069	* @output: the xmlCharEncodingOutputFunc to write that encoding
				1070	*
				1071	* Create and registers an xmlCharEncodingHandler.
				1072	* Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
				1073	*/
				1074	xmlCharEncodingHandlerPtr
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1075	xmlNewCharEncodingHandler(const char *name,
				1076	xmlCharEncodingInputFunc input,
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1077	xmlCharEncodingOutputFunc output) {
				1078	xmlCharEncodingHandlerPtr handler;
				1079	char upper[500];
				1080	int i;
				1081	char *up = 0;
				1082
				1083	/*
				1084	* Keep only the uppercase version of the encoding.
				1085	*/
				1086	if (name == NULL) {
				1087	fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
				1088	return(NULL);
				1089	}
				1090	for (i = 0;i < 499;i++) {
				1091	upper[i] = toupper(name[i]);
				1092	if (upper[i] == 0) break;
				1093	}
				1094	upper[i] = 0;
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	1095	up = xmlMemStrdup(upper);
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1096	if (up == NULL) {
				1097	fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
				1098	return(NULL);
				1099	}
				1100
				1101	/*
				1102	* allocate and fill-up an handler block.
				1103	*/
				1104	handler = (xmlCharEncodingHandlerPtr)
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	1105	xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1106	if (handler == NULL) {
				1107	fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
				1108	return(NULL);
				1109	}
				1110	handler->input = input;
				1111	handler->output = output;
				1112	handler->name = up;
				1113
Daniel Veillard	87b9539	2000-08-12 21:12:04 +0000	[diff] [blame^]	1114	#ifdef LIBXML_ICONV_ENABLED
				1115	handler->iconv_in = NULL;
				1116	handler->iconv_out = NULL;
				1117	#endif /* LIBXML_ICONV_ENABLED */
				1118
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1119	/*
				1120	* registers and returns the handler.
				1121	*/
				1122	xmlRegisterCharEncodingHandler(handler);
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1123	#ifdef DEBUG_ENCODING
				1124	fprintf(stderr, "Registered encoding handler for %s\n", name);
				1125	#endif
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1126	return(handler);
				1127	}
				1128
				1129	/**
				1130	* xmlInitCharEncodingHandlers:
				1131	*
				1132	* Initialize the char encoding support, it registers the default
				1133	* encoding supported.
Daniel Veillard	7f85850	1999-11-17 17:32:38 +0000	[diff] [blame]	1134	* NOTE: while public, this function usually doesn't need to be called
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1135	* in normal processing.
				1136	*/
				1137	void
				1138	xmlInitCharEncodingHandlers(void) {
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1139	unsigned short int tst = 0x1234;
				1140	unsigned char ptr = (unsigned char ) &tst;
				1141
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1142	if (handlers != NULL) return;
				1143
				1144	handlers = (xmlCharEncodingHandlerPtr *)
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	1145	xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1146
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1147	if (*ptr == 0x12) xmlLittleEndian = 0;
				1148	else if (*ptr == 0x34) xmlLittleEndian = 1;
				1149	else fprintf(stderr, "Odd problem at endianness detection\n");
				1150
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1151	if (handlers == NULL) {
				1152	fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
				1153	return;
				1154	}
				1155	xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1156	xmlUTF16LEHandler =
				1157	xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
				1158	xmlUTF16BEHandler =
				1159	xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1160	xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1161	xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
				1162	#ifdef LIBXML_HTML_ENABLED
				1163	xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
				1164	#endif
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1165	}
				1166
				1167	/**
Daniel Veillard	a819dac	1999-11-24 18:04:22 +0000	[diff] [blame]	1168	* xmlCleanupCharEncodingHandlers:
				1169	*
				1170	* Cleanup the memory allocated for the char encoding support, it
				1171	* unregisters all the encoding handlers.
				1172	*/
				1173	void
				1174	xmlCleanupCharEncodingHandlers(void) {
				1175	if (handlers == NULL) return;
				1176
				1177	for (;nbCharEncodingHandler > 0;) {
				1178	nbCharEncodingHandler--;
				1179	if (handlers[nbCharEncodingHandler] != NULL) {
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1180	if (handlers[nbCharEncodingHandler]->name != NULL)
				1181	xmlFree(handlers[nbCharEncodingHandler]->name);
Daniel Veillard	a819dac	1999-11-24 18:04:22 +0000	[diff] [blame]	1182	xmlFree(handlers[nbCharEncodingHandler]);
				1183	}
				1184	}
				1185	xmlFree(handlers);
				1186	handlers = NULL;
				1187	nbCharEncodingHandler = 0;
				1188	xmlDefaultCharEncodingHandler = NULL;
				1189	}
				1190
				1191	/**
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1192	* xmlRegisterCharEncodingHandler:
				1193	* @handler: the xmlCharEncodingHandlerPtr handler block
				1194	*
				1195	* Register the char encoding handler, surprizing, isn't it ?
				1196	*/
				1197	void
				1198	xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
				1199	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1200	if (handler == NULL) {
				1201	fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
				1202	return;
				1203	}
				1204
				1205	if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
				1206	fprintf(stderr,
				1207	"xmlRegisterCharEncodingHandler: Too many handler registered\n");
				1208	fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
				1209	return;
				1210	}
				1211	handlers[nbCharEncodingHandler++] = handler;
				1212	}
				1213
				1214	/**
				1215	* xmlGetCharEncodingHandler:
				1216	* @enc: an xmlCharEncoding value.
				1217	*
				1218	* Search in the registrered set the handler able to read/write that encoding.
				1219	*
				1220	* Returns the handler or NULL if not found
				1221	*/
				1222	xmlCharEncodingHandlerPtr
				1223	xmlGetCharEncodingHandler(xmlCharEncoding enc) {
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1224	xmlCharEncodingHandlerPtr handler;
				1225
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1226	if (handlers == NULL) xmlInitCharEncodingHandlers();
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1227	switch (enc) {
				1228	case XML_CHAR_ENCODING_ERROR:
				1229	return(NULL);
				1230	case XML_CHAR_ENCODING_NONE:
				1231	return(NULL);
				1232	case XML_CHAR_ENCODING_UTF8:
				1233	return(NULL);
				1234	case XML_CHAR_ENCODING_UTF16LE:
				1235	return(xmlUTF16LEHandler);
				1236	case XML_CHAR_ENCODING_UTF16BE:
				1237	return(xmlUTF16BEHandler);
				1238	case XML_CHAR_ENCODING_EBCDIC:
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1239	handler = xmlFindCharEncodingHandler("EBCDIC");
				1240	if (handler != NULL) return(handler);
				1241	handler = xmlFindCharEncodingHandler("ebcdic");
				1242	if (handler != NULL) return(handler);
				1243	break;
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1244	case XML_CHAR_ENCODING_UCS4BE:
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1245	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1246	if (handler != NULL) return(handler);
				1247	handler = xmlFindCharEncodingHandler("UCS-4");
				1248	if (handler != NULL) return(handler);
				1249	handler = xmlFindCharEncodingHandler("UCS4");
				1250	if (handler != NULL) return(handler);
				1251	break;
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1252	case XML_CHAR_ENCODING_UCS4LE:
				1253	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
				1254	if (handler != NULL) return(handler);
				1255	handler = xmlFindCharEncodingHandler("UCS-4");
				1256	if (handler != NULL) return(handler);
				1257	handler = xmlFindCharEncodingHandler("UCS4");
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1258	if (handler != NULL) return(handler);
				1259	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1260	case XML_CHAR_ENCODING_UCS4_2143:
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1261	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1262	case XML_CHAR_ENCODING_UCS4_3412:
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1263	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1264	case XML_CHAR_ENCODING_UCS2:
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1265	handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
				1266	if (handler != NULL) return(handler);
				1267	handler = xmlFindCharEncodingHandler("UCS-2");
				1268	if (handler != NULL) return(handler);
				1269	handler = xmlFindCharEncodingHandler("UCS2");
				1270	if (handler != NULL) return(handler);
				1271	break;
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1272
				1273	/*
				1274	* We used to keep ISO Latin encodings native in the
				1275	* generated data. This led to so many problems that
				1276	* this has been removed. One can still change this
				1277	* back by registering no-ops encoders for those
				1278	*/
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1279	case XML_CHAR_ENCODING_8859_1:
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1280	handler = xmlFindCharEncodingHandler("ISO-8859-1");
				1281	if (handler != NULL) return(handler);
				1282	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1283	case XML_CHAR_ENCODING_8859_2:
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1284	handler = xmlFindCharEncodingHandler("ISO-8859-2");
				1285	if (handler != NULL) return(handler);
				1286	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1287	case XML_CHAR_ENCODING_8859_3:
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1288	handler = xmlFindCharEncodingHandler("ISO-8859-3");
				1289	if (handler != NULL) return(handler);
				1290	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1291	case XML_CHAR_ENCODING_8859_4:
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1292	handler = xmlFindCharEncodingHandler("ISO-8859-4");
				1293	if (handler != NULL) return(handler);
				1294	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1295	case XML_CHAR_ENCODING_8859_5:
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1296	handler = xmlFindCharEncodingHandler("ISO-8859-5");
				1297	if (handler != NULL) return(handler);
				1298	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1299	case XML_CHAR_ENCODING_8859_6:
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1300	handler = xmlFindCharEncodingHandler("ISO-8859-6");
				1301	if (handler != NULL) return(handler);
				1302	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1303	case XML_CHAR_ENCODING_8859_7:
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1304	handler = xmlFindCharEncodingHandler("ISO-8859-7");
				1305	if (handler != NULL) return(handler);
				1306	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1307	case XML_CHAR_ENCODING_8859_8:
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1308	handler = xmlFindCharEncodingHandler("ISO-8859-8");
				1309	if (handler != NULL) return(handler);
				1310	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1311	case XML_CHAR_ENCODING_8859_9:
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1312	handler = xmlFindCharEncodingHandler("ISO-8859-9");
				1313	if (handler != NULL) return(handler);
				1314	break;
				1315
				1316
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1317	case XML_CHAR_ENCODING_2022_JP:
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1318	handler = xmlFindCharEncodingHandler("ISO-2022-JP");
				1319	if (handler != NULL) return(handler);
				1320	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1321	case XML_CHAR_ENCODING_SHIFT_JIS:
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1322	handler = xmlFindCharEncodingHandler("SHIFT-JIS");
				1323	if (handler != NULL) return(handler);
				1324	handler = xmlFindCharEncodingHandler("SHIFT_JIS");
				1325	if (handler != NULL) return(handler);
				1326	handler = xmlFindCharEncodingHandler("Shift_JIS");
				1327	if (handler != NULL) return(handler);
				1328	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1329	case XML_CHAR_ENCODING_EUC_JP:
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1330	handler = xmlFindCharEncodingHandler("EUC-JP");
				1331	if (handler != NULL) return(handler);
				1332	break;
				1333	default:
				1334	break;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	1335	}
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1336
				1337	#ifdef DEBUG_ENCODING
				1338	fprintf(stderr, "No handler found for encoding %d\n", enc);
				1339	#endif
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1340	return(NULL);
				1341	}
				1342
				1343	/**
				1344	* xmlGetCharEncodingHandler:
				1345	* @enc: a string describing the char encoding.
				1346	*
				1347	* Search in the registrered set the handler able to read/write that encoding.
				1348	*
				1349	* Returns the handler or NULL if not found
				1350	*/
				1351	xmlCharEncodingHandlerPtr
				1352	xmlFindCharEncodingHandler(const char *name) {
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1353	xmlCharEncoding alias;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1354	#ifdef LIBXML_ICONV_ENABLED
Daniel Veillard	3f6f7f6	2000-06-30 17:58:25 +0000	[diff] [blame]	1355	xmlCharEncodingHandlerPtr enc;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1356	iconv_t icv_in, icv_out;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1357	#endif /* LIBXML_ICONV_ENABLED */
				1358	char upper[100];
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1359	int i;
				1360
				1361	if (handlers == NULL) xmlInitCharEncodingHandlers();
				1362	if (name == NULL) return(xmlDefaultCharEncodingHandler);
				1363	if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
				1364
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1365	/*
				1366	* Check first for directly registered encoding names
				1367	*/
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1368	for (i = 0;i < 99;i++) {
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1369	upper[i] = toupper(name[i]);
				1370	if (upper[i] == 0) break;
				1371	}
				1372	upper[i] = 0;
				1373
				1374	for (i = 0;i < nbCharEncodingHandler; i++)
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1375	if (!strcmp(upper, handlers[i]->name)) {
				1376	#ifdef DEBUG_ENCODING
				1377	fprintf(stderr, "Found registered handler for encoding %s\n", name);
				1378	#endif
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1379	return(handlers[i]);
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1380	}
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1381
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1382	#ifdef LIBXML_ICONV_ENABLED
				1383	/* check whether iconv can handle this */
				1384	icv_in = iconv_open("UTF-8", name);
				1385	icv_out = iconv_open(name, "UTF-8");
				1386	if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1387	enc = (xmlCharEncodingHandlerPtr)
				1388	xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1389	if (enc == NULL) {
				1390	iconv_close(icv_in);
				1391	iconv_close(icv_out);
				1392	return(NULL);
				1393	}
Daniel Veillard	365e13b	2000-07-02 07:56:37 +0000	[diff] [blame]	1394	enc->name = xmlMemStrdup(name);
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1395	enc->input = NULL;
				1396	enc->output = NULL;
				1397	enc->iconv_in = icv_in;
				1398	enc->iconv_out = icv_out;
				1399	#ifdef DEBUG_ENCODING
				1400	fprintf(stderr, "Found iconv handler for encoding %s\n", name);
				1401	#endif
				1402	return enc;
				1403	} else if ((icv_in != (iconv_t) -1) \|\| icv_out != (iconv_t) -1) {
				1404	fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
				1405	}
				1406	#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1407
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1408	#ifdef DEBUG_ENCODING
				1409	fprintf(stderr, "No handler found for encoding %s\n", name);
				1410	#endif
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1411
				1412	/*
				1413	* Fallback using the canonical names
				1414	*/
				1415	alias = xmlParseCharEncoding(name);
				1416	if (alias != XML_CHAR_ENCODING_ERROR) {
				1417	const char* canon;
				1418	canon = xmlGetCharEncodingName(alias);
				1419	if ((canon != NULL) && (strcmp(name, canon))) {
				1420	return(xmlFindCharEncodingHandler(canon));
				1421	}
				1422	}
				1423
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	1424	return(NULL);
				1425	}
				1426
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1427	#ifdef LIBXML_ICONV_ENABLED
				1428	/**
				1429	* xmlIconvWrapper:
				1430	* @cd: iconv converter data structure
				1431	* @out: a pointer to an array of bytes to store the result
				1432	* @outlen: the length of @out
				1433	* @in: a pointer to an array of ISO Latin 1 chars
				1434	* @inlen: the length of @in
				1435	*
				1436	* Returns 0 if success, or
				1437	* -1 by lack of space, or
				1438	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1439	* the result of transformation can't fit into the encoding we want), or
				1440	* -3 if there the last byte can't form a single output char.
				1441	*
				1442	* The value of @inlen after return is the number of octets consumed
				1443	* as the return value is positive, else unpredictiable.
				1444	* The value of @outlen after return is the number of ocetes consumed.
				1445	*/
				1446	static int
				1447	xmlIconvWrapper(iconv_t cd,
				1448	unsigned char out, int outlen,
				1449	const unsigned char in, int inlen) {
				1450
				1451	size_t icv_inlen = inlen, icv_outlen = outlen;
				1452	const char icv_in = (const char ) in;
				1453	char icv_out = (char ) out;
				1454	int ret;
				1455
				1456	ret = iconv(cd,
				1457	&icv_in, &icv_inlen,
				1458	&icv_out, &icv_outlen);
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1459	if (in != NULL) {
				1460	*inlen -= icv_inlen;
				1461	*outlen -= icv_outlen;
				1462	} else {
				1463	*inlen = 0;
				1464	*outlen = 0;
				1465	}
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1466	if (icv_inlen != 0 \|\| ret == (size_t) -1) {
				1467	#ifdef EILSEQ
				1468	if (errno == EILSEQ) {
				1469	return -2;
				1470	} else
				1471	#endif
				1472	#ifdef E2BIG
				1473	if (errno == E2BIG) {
				1474	return -1;
				1475	} else
				1476	#endif
				1477	#ifdef EINVAL
				1478	if (errno == EINVAL) {
				1479	return -3;
				1480	}
				1481	#endif
				1482	else {
				1483	return -3;
				1484	}
				1485	}
				1486	return 0;
				1487	}
				1488	#endif /* LIBXML_ICONV_ENABLED */
				1489
				1490	/**
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1491	* xmlCharEncFirstLine:
				1492	* @handler: char enconding transformation data structure
				1493	* @out: an xmlBuffer for the output.
				1494	* @in: an xmlBuffer for the input
				1495	*
				1496	* Front-end for the encoding handler input function, but handle only
				1497	* the very first line, i.e. limit itself to 45 chars.
				1498	*
				1499	* Returns the number of byte written if success, or
				1500	* -1 general error
				1501	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1502	* the result of transformation can't fit into the encoding we want), or
				1503	*/
				1504	int
				1505	xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				1506	xmlBufferPtr in) {
				1507	int ret = -2;
				1508	int written;
				1509	int toconv;
				1510
				1511	if (handler == NULL) return(-1);
				1512	if (out == NULL) return(-1);
				1513	if (in == NULL) return(-1);
				1514
				1515	written = out->size - out->use;
				1516	toconv = in->use;
				1517	if (toconv * 2 >= written) {
				1518	xmlBufferGrow(out, toconv);
				1519	written = out->size - out->use - 1;
				1520	}
				1521
				1522	/*
				1523	* echo '<?xml version="1.0" encoding="UCS4"?>' \| wc -c => 38
				1524	* 45 chars should be sufficient to reach the end of the encoding
				1525	* decalration without going too far inside the document content.
				1526	*/
				1527	written = 45;
				1528
				1529	if (handler->input != NULL) {
				1530	ret = handler->input(&out->content[out->use], &written,
				1531	in->content, &toconv);
				1532	xmlBufferShrink(in, toconv);
				1533	out->use += written;
				1534	out->content[out->use] = 0;
				1535	}
				1536	#ifdef LIBXML_ICONV_ENABLED
				1537	else if (handler->iconv_in != NULL) {
				1538	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				1539	&written, in->content, &toconv);
				1540	xmlBufferShrink(in, toconv);
				1541	out->use += written;
				1542	out->content[out->use] = 0;
				1543	if (ret == -1) ret = -3;
				1544	}
				1545	#endif /* LIBXML_ICONV_ENABLED */
				1546	#ifdef DEBUG_ENCODING
				1547	switch (ret) {
				1548	case 0:
				1549	fprintf(stderr, "converted %d bytes to %d bytes of input\n",
				1550	toconv, written);
				1551	break;
				1552	case -1:
				1553	fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
				1554	toconv, written, in->use);
				1555	break;
				1556	case -2:
				1557	fprintf(stderr, "input conversion failed due to input error\n");
				1558	break;
				1559	case -3:
				1560	fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
				1561	toconv, written, in->use);
				1562	break;
				1563	default:
				1564	fprintf(stderr,"Unknown input conversion failed %d\n", ret);
				1565	}
				1566	#endif
				1567	/*
				1568	* Ignore when input buffer is not on a boundary
				1569	*/
				1570	if (ret == -3) ret = 0;
				1571	if (ret == -1) ret = 0;
				1572	return(ret);
				1573	}
				1574
				1575	/**
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1576	* xmlCharEncInFunc:
				1577	* @handler: char enconding transformation data structure
				1578	* @out: an xmlBuffer for the output.
				1579	* @in: an xmlBuffer for the input
				1580	*
				1581	* Generic front-end for the encoding handler input function
				1582	*
				1583	* Returns the number of byte written if success, or
				1584	* -1 general error
				1585	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1586	* the result of transformation can't fit into the encoding we want), or
				1587	*/
				1588	int
				1589	xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				1590	xmlBufferPtr in) {
				1591	int ret = -2;
				1592	int written;
				1593	int toconv;
				1594
				1595	if (handler == NULL) return(-1);
				1596	if (out == NULL) return(-1);
				1597	if (in == NULL) return(-1);
				1598
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1599	toconv = in->use;
Daniel Veillard	87b9539	2000-08-12 21:12:04 +0000	[diff] [blame^]	1600	if (toconv == 0)
				1601	return(0);
				1602	written = out->size - out->use;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1603	if (toconv * 2 >= written) {
				1604	xmlBufferGrow(out, toconv * 2);
				1605	written = out->size - out->use - 1;
				1606	}
				1607	if (handler->input != NULL) {
				1608	ret = handler->input(&out->content[out->use], &written,
				1609	in->content, &toconv);
				1610	xmlBufferShrink(in, toconv);
				1611	out->use += written;
				1612	out->content[out->use] = 0;
				1613	}
				1614	#ifdef LIBXML_ICONV_ENABLED
				1615	else if (handler->iconv_in != NULL) {
				1616	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
				1617	&written, in->content, &toconv);
				1618	xmlBufferShrink(in, toconv);
				1619	out->use += written;
				1620	out->content[out->use] = 0;
				1621	if (ret == -1) ret = -3;
				1622	}
				1623	#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1624	switch (ret) {
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1625	#ifdef DEBUG_ENCODING
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1626	case 0:
				1627	fprintf(stderr, "converted %d bytes to %d bytes of input\n",
				1628	toconv, written);
				1629	break;
				1630	case -1:
				1631	fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
				1632	toconv, written, in->use);
				1633	break;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1634	case -3:
				1635	fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
				1636	toconv, written, in->use);
				1637	break;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1638	#endif
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1639	case -2:
				1640	fprintf(stderr, "input conversion failed due to input error\n");
				1641	fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				1642	in->content[0], in->content[1],
				1643	in->content[2], in->content[3]);
				1644	}
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1645	/*
				1646	* Ignore when input buffer is not on a boundary
				1647	*/
				1648	if (ret == -3) ret = 0;
				1649	return(ret);
				1650	}
				1651
				1652	/**
				1653	* xmlCharEncOutFunc:
				1654	* @handler: char enconding transformation data structure
				1655	* @out: an xmlBuffer for the output.
				1656	* @in: an xmlBuffer for the input
				1657	*
				1658	* Generic front-end for the encoding handler output function
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1659	* a first call with @in == NULL has to be made firs to initiate the
				1660	* output in case of non-stateless encoding needing to initiate their
				1661	* state or the output (like the BOM in UTF16).
				1662	* In case of UTF8 sequence conversion errors for the given encoder,
				1663	* the content will be automatically remapped to a CharRef sequence.
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1664	*
				1665	* Returns the number of byte written if success, or
				1666	* -1 general error
				1667	* -2 if the transcoding fails (for *in is not valid utf8 string or
				1668	* the result of transformation can't fit into the encoding we want), or
				1669	*/
				1670	int
				1671	xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
				1672	xmlBufferPtr in) {
				1673	int ret = -2;
				1674	int written;
				1675	int toconv;
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1676	int output = 0;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1677
				1678	if (handler == NULL) return(-1);
				1679	if (out == NULL) return(-1);
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1680
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1681	retry:
				1682
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1683	written = out->size - out->use;
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1684
				1685	/*
				1686	* First specific handling of in = NULL, i.e. the initialization call
				1687	*/
				1688	if (in == NULL) {
				1689	toconv = 0;
				1690	if (handler->output != NULL) {
				1691	ret = handler->output(&out->content[out->use], &written,
				1692	NULL, &toconv);
				1693	out->use += written;
				1694	out->content[out->use] = 0;
				1695	}
				1696	#ifdef LIBXML_ICONV_ENABLED
				1697	else if (handler->iconv_out != NULL) {
				1698	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				1699	&written, NULL, &toconv);
				1700	out->use += written;
				1701	out->content[out->use] = 0;
				1702	}
				1703	#endif /* LIBXML_ICONV_ENABLED */
				1704	#ifdef DEBUG_ENCODING
				1705	fprintf(stderr, "initialized encoder\n");
				1706	#endif
				1707	return(0);
				1708	}
				1709
				1710	/*
				1711	* Convertion itself.
				1712	*/
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1713	toconv = in->use;
Daniel Veillard	87b9539	2000-08-12 21:12:04 +0000	[diff] [blame^]	1714	if (toconv == 0)
				1715	return(0);
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1716	if (toconv * 2 >= written) {
				1717	xmlBufferGrow(out, toconv * 2);
				1718	written = out->size - out->use - 1;
				1719	}
				1720	if (handler->output != NULL) {
				1721	ret = handler->output(&out->content[out->use], &written,
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1722	in->content, &toconv);
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1723	xmlBufferShrink(in, toconv);
				1724	out->use += written;
				1725	out->content[out->use] = 0;
				1726	}
				1727	#ifdef LIBXML_ICONV_ENABLED
				1728	else if (handler->iconv_out != NULL) {
				1729	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
				1730	&written, in->content, &toconv);
				1731	xmlBufferShrink(in, toconv);
				1732	out->use += written;
				1733	out->content[out->use] = 0;
				1734	if (ret == -1) ret = -3;
				1735	}
				1736	#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1737	else {
				1738	fprintf(stderr, "xmlCharEncOutFunc: no output function !\n");
				1739	return(-1);
				1740	}
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1741
				1742	if (ret >= 0) output += ret;
				1743
				1744	/*
				1745	* Attempt to handle error cases
				1746	*/
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1747	switch (ret) {
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1748	#ifdef DEBUG_ENCODING
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1749	case 0:
				1750	fprintf(stderr, "converted %d bytes to %d bytes of output\n",
				1751	toconv, written);
				1752	break;
				1753	case -1:
				1754	fprintf(stderr, "output conversion failed by lack of space\n");
				1755	break;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1756	case -3:
				1757	fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
				1758	toconv, written, in->use);
				1759	break;
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1760	#endif
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1761	case -2: {
				1762	int len = in->use;
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1763	const xmlChar utf = (const xmlChar ) in->content;
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1764	int cur;
				1765
				1766	cur = xmlGetUTF8Char(utf, &len);
				1767	if (cur > 0) {
				1768	xmlChar charref[20];
				1769
				1770	#ifdef DEBUG_ENCODING
				1771	fprintf(stderr, "handling output conversion error\n");
				1772	fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				1773	in->content[0], in->content[1],
				1774	in->content[2], in->content[3]);
				1775	#endif
				1776	/*
				1777	* Removes the UTF8 sequence, and replace it by a charref
				1778	* and continue the transcoding phase, hoping the error
				1779	* did not mangle the encoder state.
				1780	*/
Daniel Veillard	32bc74e	2000-07-14 14:49:25 +0000	[diff] [blame]	1781	sprintf((char *) charref, "&#x%X;", cur);
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1782	xmlBufferShrink(in, len);
				1783	xmlBufferAddHead(in, charref, -1);
				1784
				1785	goto retry;
				1786	} else {
				1787	fprintf(stderr, "output conversion failed due to conv error\n");
				1788	fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
				1789	in->content[0], in->content[1],
				1790	in->content[2], in->content[3]);
Daniel Veillard	87b9539	2000-08-12 21:12:04 +0000	[diff] [blame^]	1791	in->content[0] = ' ';
Daniel Veillard	be80396	2000-06-28 23:40:59 +0000	[diff] [blame]	1792	}
				1793	break;
				1794	}
				1795	}
Daniel Veillard	496a1cf	2000-05-03 14:20:55 +0000	[diff] [blame]	1796	return(ret);
				1797	}
				1798
				1799	/**
				1800	* xmlCharEncCloseFunc:
				1801	* @handler: char enconding transformation data structure
				1802	*
				1803	* Generic front-end for hencoding handler close function
				1804	*
				1805	* Returns 0 if success, or -1 in case of error
				1806	*/
				1807	int
				1808	xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
				1809	int ret = 0;
				1810	if (handler == NULL) return(-1);
				1811	if (handler->name == NULL) return(-1);
				1812	#ifdef LIBXML_ICONV_ENABLED
				1813	/*
				1814	* Iconv handlers can be oused only once, free the whole block.
				1815	* and the associated icon resources.
				1816	*/
				1817	if ((handler->iconv_out != NULL) \|\| (handler->iconv_in != NULL)) {
				1818	if (handler->name != NULL)
				1819	xmlFree(handler->name);
				1820	handler->name = NULL;
				1821	if (handler->iconv_out != NULL) {
				1822	if (iconv_close(handler->iconv_out))
				1823	ret = -1;
				1824	handler->iconv_out = NULL;
				1825	}
				1826	if (handler->iconv_in != NULL) {
				1827	if (iconv_close(handler->iconv_in))
				1828	ret = -1;
				1829	handler->iconv_in = NULL;
				1830	}
				1831	xmlFree(handler);
				1832	}
				1833	#endif /* LIBXML_ICONV_ENABLED */
				1834	#ifdef DEBUG_ENCODING
				1835	if (ret)
				1836	fprintf(stderr, "failed to close the encoding handler\n");
				1837	else
				1838	fprintf(stderr, "closed the encoding handler\n");
				1839
				1840	#endif
				1841	return(ret);
				1842	}
				1843