Blame - encoding.c - fp2-dev/platform/external/libxml2

blob: 420092915edcc1bb446cd535cb0dfd5939b2de98 [file] [log] [blame]

Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	1	/*
				2	* encoding.c : implements the encoding conversion functions needed for XML
				3	*
				4	* Related specs:
				5	* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
				6	* [ISO-10646] UTF-8 and UTF-16 in Annexes
				7	* [ISO-8859-1] ISO Latin-1 characters codes.
				8	* [UNICODE] The Unicode Consortium, "The Unicode Standard --
				9	* Worldwide Character Encoding -- Version 1.0", Addison-
				10	* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
				11	* described in Unicode Technical Report #4.
				12	* [US-ASCII] Coded Character Set--7-bit American Standard Code for
				13	* Information Interchange, ANSI X3.4-1986.
				14	*
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	15	* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	16	*
				17	* See Copyright for the status of this software.
				18	*
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	19	* Daniel.Veillard@w3.org
				20	*/
				21
Daniel Veillard	3c558c3	1999-12-22 11:30:41 +0000	[diff] [blame]	22	#ifdef WIN32
				23	#include "win32config.h"
				24	#else
Daniel Veillard	b96e643	1999-08-29 21:02:19 +0000	[diff] [blame]	25	#include "config.h"
Daniel Veillard	7f7d111	1999-09-22 09:46:25 +0000	[diff] [blame]	26	#endif
				27
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	28	#include <stdio.h>
Daniel Veillard	7f7d111	1999-09-22 09:46:25 +0000	[diff] [blame]	29	#include <string.h>
				30
				31	#ifdef HAVE_CTYPE_H
				32	#include <ctype.h>
				33	#endif
Daniel Veillard	6d3bf1f	1999-12-16 17:52:19 +0000	[diff] [blame]	34	#ifdef HAVE_STDLIB_H
				35	#include <stdlib.h>
				36	#endif
Daniel Veillard	361d845	2000-04-03 19:48:13 +0000	[diff] [blame]	37	#include <libxml/encoding.h>
				38	#include <libxml/xmlmemory.h>
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	39
/*
 * Handler pointers, presumably for the UTF-16LE and UTF-16BE encodings
 * (going by their names). Both have external linkage and start out NULL;
 * nothing in the visible part of this file assigns them.
 */
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	40	xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
				41	xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Daniel Veillard	b05deb7	1999-08-10 19:04:08 +0000	[diff] [blame]	42
Daniel Veillard	0ba4d53	1998-11-01 19:34:31 +0000	[diff] [blame]	43	/*
				44	* From rfc2044: encoding of the Unicode values on UTF-8:
				45	*
				46	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				47	* 0000 0000-0000 007F 0xxxxxxx
				48	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				49	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				50	*
				51	* I hope we won't use values > 0xFFFF anytime soon !
				52	*/
				53
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string (may be NULL).
 *
 * Checks @utf for being structurally valid utf-8. @utf is assumed to be
 * null-terminated. Only the bit patterns of lead and continuation bytes
 * are examined:
 *     0xxxxxxx                                1-byte code
 *     110xxxxx 10xxxxxx                       2-byte code
 *     1110xxxx 10xxxxxx 10xxxxxx              3-byte code
 *     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx     4-byte code
 * A continuation byte (10xxxxxx) in lead position and any lead byte of
 * the form 11111xxx are rejected.
 *
 * This function is not super-strict: it accepts sequences that are longer
 * than necessary (overlong forms) and it does not check decoded values
 * against the 0x10ffff maximum.
 *
 * Return value: 1 if @utf is valid, 0 if it is invalid or NULL.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * The continuation tests are chained with || so that evaluation stops
     * at the first byte that is not 10xxxxxx; the terminating 0 is such a
     * byte, hence we never read past the end of the string.
     */
    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte or 11111xxx lead byte */
            return(0);
        }
    }
    return(1);
}
				97
				98	/**
Daniel Veillard	97b5877	1998-10-20 06:14:16 +0000	[diff] [blame]	99	* isolat1ToUTF8:
Daniel Veillard	7f85850	1999-11-17 17:32:38 +0000	[diff] [blame]	100	* @out: a pointer to an array of bytes to store the result
				101	* @outlen: the length of @out
				102	* @in: a pointer to an array of ISO Latin 1 chars
				103	* @inlen: the length of @in
Daniel Veillard	97b5877	1998-10-20 06:14:16 +0000	[diff] [blame]	104	*
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	105	* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
				106	* block of chars out.
Daniel Veillard	1e346af	1999-02-22 10:33:01 +0000	[diff] [blame]	107	* Returns the number of byte written, or -1 by lack of space.
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	108	*/
Daniel Veillard	97b5877	1998-10-20 06:14:16 +0000	[diff] [blame]	109	int
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	110	isolat1ToUTF8(unsigned char* out, int outlen,
				111	const unsigned char* in, int *inlen) {
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	112	unsigned char* outstart= out;
				113	unsigned char* outend= out+outlen;
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	114	const unsigned char* inend= in+*inlen;
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	115	unsigned char c;
				116
				117	while (in < inend) {
				118	c= *in++;
				119	if (c < 0x80) {
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	120	if (out >= outend) return(-1);
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	121	*out++ = c;
				122	}
				123	else {
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	124	if (out >= outend) return(-1);
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	125	*out++ = 0xC0 \| (c >> 6);
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	126	if (out >= outend) return(-1);
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	127	*out++ = 0x80 \| (0x3F & c);
				128	}
				129	}
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	130	return(out-outstart);
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	131	}
				132
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  pointer to the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only ASCII bytes and two byte sequences whose lead
 * byte is 0xC0..0xC3 (i.e. values up to 0xFF) can be converted.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding fails (for *in is not valid utf8 string or
 *     the result of transformation can't fit into the encoding we want).
 * When the input ends on a non-ASCII byte that has no following byte,
 * conversion stops there and *@inlen is decremented by one so that it
 * holds the number of octets consumed. In every other case *@inlen is
 * left untouched.
 */
int
UTF8Toisolat1(unsigned char* out, int outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* inend = in + *inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend)
                return(-1);
            *out++ = c;
        }
        else if (in == inend) {
            /* last byte opens a sequence we can't see the end of */
            *inlen -= 1;
            break;
        }
        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
            /* a two byte utf-8 and can be encoding as isolate1 */
            if (out >= outend)
                return(-1);
            *out++ = (unsigned char) (((c & 0x03) << 6) | (*in++ & 0x3F));
        }
        else
            return(-2);
        /* TODO : some should be represent as "&#x____;" */
    }
    return((int) (out - outstart));
}
				178
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16LE text passed as a byte array
 * @inlenb:  pointer to the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an UTF-8
 * block of chars out. The input is read one byte at a time (low byte
 * first), so the result does not depend on the endianness of the machine
 * nor on the alignment of @inb.
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding fails (a high surrogate is not followed by a
 *     low surrogate).
 * *@inlenb is adjusted to the number of octets consumed: an odd trailing
 * byte is dropped, and so is a high surrogate that ends the input. When
 * the return value is negative its content is unpredictable.
 */
int
UTF16LEToUTF8(unsigned char* out, int outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;
    while (in < inend) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {           /* (in > inend) shouldn't happens */
                /* the low half is not there yet: leave the pair unread */
                (*inlenb) -= 2;
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else
                return(-2);
        }

        /* assertion: c is a single UCS-4 value */
        if (out >= outend)
            return(-1);
        /* lead byte; bits is the shift of the first continuation byte */
        if (c < 0x80) {
            *out++ = (unsigned char) c;
            bits = -6;
        } else if (c < 0x800) {
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            bits = 0;
        } else if (c < 0x10000) {
            *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
            bits = 6;
        } else {
            *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);
            bits = 12;
        }

        for ( ; bits >= 0; bits -= 6) {
            if (out >= outend)
                return(-1);
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);
        }
    }
    return((int) (out - outstart));
}
				260
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes (an odd trailing byte is unused)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  pointer to the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written one byte at a time (low byte
 * first), so the result does not depend on the endianness of the machine
 * nor on the alignment of @outb.
 * TODO: UTF8ToUTF16LE need a fallback mechanism ...
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding failed (invalid lead or continuation byte, or a
 *     value above 0x10FFFF).
 * When the input ends in the middle of a multi-byte sequence, conversion
 * stops before that sequence and *@inlen is set to the number of octets
 * consumed. In every other case *@inlen is left untouched.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* inend = in + *inlen;
    const unsigned char* lead;
    unsigned int c, d, trailing;
    unsigned int hi, lo;

    outend = outb + (outlen / 2) * 2;    /* whole 16-bit units only */
    while (in < inend) {
        lead = in;
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0)
            return(-2);    /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else
            return(-2);    /* no chance for this in UTF-16 */

        if ((unsigned int) (inend - in) < trailing) {
            /* incomplete sequence: report only what precedes its lead */
            *inlen = (int) (lead - instart);
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                return(-2);
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                return(-1);
            out[0] = (unsigned char) (c & 0xFF);
            out[1] = (unsigned char) (c >> 8);
            out += 2;
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                return(-1);
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi & 0xFF);
            out[1] = (unsigned char) (hi >> 8);
            out[2] = (unsigned char) (lo & 0xFF);
            out[3] = (unsigned char) (lo >> 8);
            out += 4;
        }
        else
            return(-2);
    }
    return((int) (out - outb));
}
				353
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16BE text passed as a byte array
 * @inlenb:  pointer to the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an UTF-8
 * block of chars out. The input is read one byte at a time (high byte
 * first), so the result does not depend on the endianness of the machine
 * nor on the alignment of @inb.
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding fails (a high surrogate is not followed by a
 *     low surrogate).
 * *@inlenb is adjusted to the number of octets consumed: an odd trailing
 * byte is dropped, and so is a high surrogate that ends the input. When
 * the return value is negative its content is unpredictable.
 */
int
UTF16BEToUTF8(unsigned char* out, int outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;
    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {           /* (in > inend) shouldn't happens */
                /* the low half is not there yet: leave the pair unread */
                (*inlenb) -= 2;
                break;
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else
                return(-2);
        }

        /* assertion: c is a single UCS-4 value */
        if (out >= outend)
            return(-1);
        /* lead byte; bits is the shift of the first continuation byte */
        if (c < 0x80) {
            *out++ = (unsigned char) c;
            bits = -6;
        } else if (c < 0x800) {
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            bits = 0;
        } else if (c < 0x10000) {
            *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
            bits = 6;
        } else {
            *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);
            bits = 12;
        }

        for ( ; bits >= 0; bits -= 6) {
            if (out >= outend)
                return(-1);
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);
        }
    }
    return((int) (out - outstart));
}
				441
				442	/**
				443	* UTF8ToUTF16BE:
				444	* @outb: a pointer to an array of bytes to store the result
				445	* @outlen: the length of @outb
				446	* @in: a pointer to an array of UTF-8 chars
				447	* @inlen: the length of @in
				448	*
				449	* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
				450	* block of chars out.
				451	* TODO: UTF8ToUTF16BE need a fallback mechanism ...
				452	*
				453	* Returns the number of byte written, or -1 by lack of space, or -2
				454	* if the transcoding failed.
				455	*/
				456	int
				457	UTF8ToUTF16BE(unsigned char* outb, int outlen,
				458	const unsigned char* in, int *inlen)
				459	{
				460	unsigned short* out = (unsigned short*) outb;
				461	unsigned short* outstart= out;
				462	unsigned short* outend;
				463	const unsigned char* inend= in+*inlen;
				464	unsigned int c, d, trailing;
				465	#ifdef BIG_ENDIAN
				466	#else
				467	unsigned char *tmp;
				468	unsigned short tmp1, tmp2;
				469	#endif /* BIG_ENDIAN */
				470
				471	outlen /= 2; /* convert in short length */
				472	outend = out + outlen;
				473	while (in < inend) {
				474	d= *in++;
				475	if (d < 0x80) { c= d; trailing= 0; }
				476	else if (d < 0xC0)
				477	return(-2); /* trailing byte in leading position */
				478	else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
				479	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
				480	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
				481	else
				482	return(-2); /* no chance for this in UTF-16 */
				483
				484	if (inend - in < trailing) {
				485	*inlen -= (inend - in);
				486	break;
				487	}
				488
				489	for ( ; trailing; trailing--) {
				490	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80)) return(-1);
				491	c <<= 6;
				492	c \|= d & 0x3F;
				493	}
				494
				495	/* assertion: c is a single UTF-4 value */
				496	if (c < 0x10000) {
				497	if (out >= outend) return(-1);
				498	#ifdef BIG_ENDIAN
				499	*out++ = c;
				500	#else
				501	tmp = (unsigned char *) out;
				502	*tmp = c >> 8;
				503	*(tmp + 1) = c;
				504	out++;
				505	#endif /* BIG_ENDIAN */
				506	}
				507	else if (c < 0x110000) {
				508	if (out+1 >= outend) return(-1);
				509	c -= 0x10000;
				510	#ifdef BIG_ENDIAN
				511	*out++ = 0xD800 \| (c >> 10);
				512	*out++ = 0xDC00 \| (c & 0x03FF);
				513	#else
				514	tmp1 = 0xD800 \| (c >> 10);
				515	tmp = (unsigned char *) out;
				516	*tmp = tmp1 >> 8;
				517	*(tmp + 1) = tmp1;
				518	out++;
				519
				520	tmp2 = 0xDC00 \| (c & 0x03FF);
				521	tmp = (unsigned char *) out;
				522	*tmp = tmp2 >> 8;
				523	*(tmp + 1) = tmp2;
				524	out++;
				525	#endif
				526	}
				527	else return(-1);
				528	}
				529	return(out-outstart);
				530	}
Daniel Veillard	97b5877	1998-10-20 06:14:16 +0000	[diff] [blame]	531
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	532	/**
				533	* xmlDetectCharEncoding:
				534	* @in: a pointer to the first bytes of the XML entity, must be at least
				535	* 4 bytes long.
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	536	* @len: pointer to the length of the buffer
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	537	*
				538	* Guess the encoding of the entity using the first bytes of the entity content
				539	* accordingly of the non-normative appendix F of the XML-1.0 recommendation.
				540	*
				541	* Returns one of the XML_CHAR_ENCODING_... values.
				542	*/
				543	xmlCharEncoding
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	544	xmlDetectCharEncoding(const unsigned char* in, int len)
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	545	{
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	546	if (len >= 4) {
				547	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				548	(in[2] == 0x00) && (in[3] == 0x3C))
				549	return(XML_CHAR_ENCODING_UCS4BE);
				550	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
				551	(in[2] == 0x00) && (in[3] == 0x00))
				552	return(XML_CHAR_ENCODING_UCS4LE);
				553	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				554	(in[2] == 0x3C) && (in[3] == 0x00))
				555	return(XML_CHAR_ENCODING_UCS4_2143);
				556	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
				557	(in[2] == 0x00) && (in[3] == 0x00))
				558	return(XML_CHAR_ENCODING_UCS4_3412);
				559	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
				560	(in[2] == 0xA7) && (in[3] == 0x94))
				561	return(XML_CHAR_ENCODING_EBCDIC);
				562	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
				563	(in[2] == 0x78) && (in[3] == 0x6D))
				564	return(XML_CHAR_ENCODING_UTF8);
				565	}
				566	if (len >= 2) {
				567	if ((in[0] == 0xFE) && (in[1] == 0xFF))
				568	return(XML_CHAR_ENCODING_UTF16BE);
				569	if ((in[0] == 0xFF) && (in[1] == 0xFE))
				570	return(XML_CHAR_ENCODING_UTF16LE);
				571	}
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	572	return(XML_CHAR_ENCODING_NONE);
				573	}
				574
				575	/**
				576	* xmlParseCharEncoding:
Daniel Veillard	7f85850	1999-11-17 17:32:38 +0000	[diff] [blame]	577	* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	578	*
				579	* Conpare the string to the known encoding schemes already known. Note
				580	* that the comparison is case insensitive accordingly to the section
				581	* [XML] 4.3.3 Character Encoding in Entities.
				582	*
				583	* Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
				584	* if not recognized.
				585	*/
				586	xmlCharEncoding
Daniel Veillard	011b63c	1999-06-02 17:44:04 +0000	[diff] [blame]	587	xmlParseCharEncoding(const char* name)
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame]	588	{
				589	char upper[500];
				590	int i;
				591
				592	for (i = 0;i < 499;i++) {
				593	upper[i] = toupper(name[i]);
				594	if (upper[i] == 0) break;
				595	}
				596	upper[i] = 0;
				597
				598	if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
				599	if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
				600	if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
				601
				602	/*
				603	* NOTE: if we were able to parse this, the endianness of UTF16 is
				604	* already found and in use
				605	*/
				606	if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
				607	if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
				608
				609	if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				610	if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				611	if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
				612
				613	/*
				614	* NOTE: if we were able to parse this, the endianness of UCS4 is
				615	* already found and in use
				616	*/
				617	if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				618	if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				619	if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
				620
				621
				622	if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
				623	if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
				624	if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
				625
				626	if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
				627	if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
				628	if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
				629
				630	if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
				631	if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
				632	if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
				633	if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
				634	if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
				635	if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
				636	if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
				637
				638	if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
				639	if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
				640	if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
				641	return(XML_CHAR_ENCODING_ERROR);
				642	}
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	643
				644	/****************************************************************
				645	* *
				646	* Char encoding handlers *
				647	* *
				648	****************************************************************/
				649
				650	/* the size should be growable, but it's not a big deal ... */
/*
 * File-local registry state. handlers starts out NULL and
 * nbCharEncodingHandler starts at 0; xmlInitCharEncodingHandlers() returns
 * immediately once handlers is non-NULL. MAX_ENCODING_HANDLERS is
 * presumably the capacity of the handlers table -- its use is not visible
 * in this part of the file, to be confirmed.
 */
				651	#define MAX_ENCODING_HANDLERS 50
				652	static xmlCharEncodingHandlerPtr *handlers = NULL;
				653	static int nbCharEncodingHandler = 0;
				654
				655	/*
				656	* The default is UTF-8 for XML, that's also the default used for the
				657	* parser internals, so the default encoding handler is NULL
				658	*/
				659
				660	static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
				661
				662	/**
				663	* xmlNewCharEncodingHandler:
Daniel Veillard	7f85850	1999-11-17 17:32:38 +0000	[diff] [blame]	664	* @name: the encoding name, in UTF-8 format (ASCII actually)
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	665	* @input: the xmlCharEncodingInputFunc to read that encoding
				666	* @output: the xmlCharEncodingOutputFunc to write that encoding
				667	*
				668	* Create and registers an xmlCharEncodingHandler.
				669	* Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
				670	*/
				671	xmlCharEncodingHandlerPtr
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	672	xmlNewCharEncodingHandler(const char *name,
				673	xmlCharEncodingInputFunc input,
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	674	xmlCharEncodingOutputFunc output) {
				675	xmlCharEncodingHandlerPtr handler;
				676	char upper[500];
				677	int i;
				678	char *up = 0;
				679
				680	/*
				681	* Keep only the uppercase version of the encoding.
				682	*/
				683	if (name == NULL) {
				684	fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
				685	return(NULL);
				686	}
				687	for (i = 0;i < 499;i++) {
				688	upper[i] = toupper(name[i]);
				689	if (upper[i] == 0) break;
				690	}
				691	upper[i] = 0;
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	692	up = xmlMemStrdup(upper);
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	693	if (up == NULL) {
				694	fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
				695	return(NULL);
				696	}
				697
				698	/*
				699	* allocate and fill-up an handler block.
				700	*/
				701	handler = (xmlCharEncodingHandlerPtr)
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	702	xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	703	if (handler == NULL) {
				704	fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
				705	return(NULL);
				706	}
				707	handler->input = input;
				708	handler->output = output;
				709	handler->name = up;
				710
				711	/*
				712	* registers and returns the handler.
				713	*/
				714	xmlRegisterCharEncodingHandler(handler);
				715	return(handler);
				716	}
				717
				718	/**
				719	* xmlInitCharEncodingHandlers:
				720	*
				721	* Initialize the char encoding support, it registers the default
				722	* encoding supported.
Daniel Veillard	7f85850	1999-11-17 17:32:38 +0000	[diff] [blame]	723	* NOTE: while public, this function usually doesn't need to be called
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	724	* in normal processing.
				725	*/
				726	void
				727	xmlInitCharEncodingHandlers(void) {
				728	if (handlers != NULL) return;
				729
				730	handlers = (xmlCharEncodingHandlerPtr *)
Daniel Veillard	6454aec	1999-09-02 22:04:43 +0000	[diff] [blame]	731	xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	732
				733	if (handlers == NULL) {
				734	fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
				735	return;
				736	}
				737	xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	738	xmlUTF16LEHandler =
				739	xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
				740	xmlUTF16BEHandler =
				741	xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	742	xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
				743	}
				744
				745	/**
Daniel Veillard	a819dac	1999-11-24 18:04:22 +0000	[diff] [blame]	746	* xmlCleanupCharEncodingHandlers:
				747	*
				748	* Cleanup the memory allocated for the char encoding support, it
				749	* unregisters all the encoding handlers.
				750	*/
				751	void
				752	xmlCleanupCharEncodingHandlers(void) {
				753	if (handlers == NULL) return;
				754
				755	for (;nbCharEncodingHandler > 0;) {
				756	nbCharEncodingHandler--;
				757	if (handlers[nbCharEncodingHandler] != NULL) {
				758	xmlFree(handlers[nbCharEncodingHandler]->name);
				759	xmlFree(handlers[nbCharEncodingHandler]);
				760	}
				761	}
				762	xmlFree(handlers);
				763	handlers = NULL;
				764	nbCharEncodingHandler = 0;
				765	xmlDefaultCharEncodingHandler = NULL;
				766	}
				767
				768	/**
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	769	* xmlRegisterCharEncodingHandler:
				770	* @handler: the xmlCharEncodingHandlerPtr handler block
				771	*
				772	* Register the char encoding handler, surprizing, isn't it ?
				773	*/
				774	void
				775	xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
				776	if (handlers == NULL) xmlInitCharEncodingHandlers();
				777	if (handler == NULL) {
				778	fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
				779	return;
				780	}
				781
				782	if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
				783	fprintf(stderr,
				784	"xmlRegisterCharEncodingHandler: Too many handler registered\n");
				785	fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
				786	return;
				787	}
				788	handlers[nbCharEncodingHandler++] = handler;
				789	}
				790
				791	/**
				792	* xmlGetCharEncodingHandler:
				793	* @enc: an xmlCharEncoding value.
				794	*
				795	* Search in the registrered set the handler able to read/write that encoding.
				796	*
				797	* Returns the handler or NULL if not found
				798	*/
				799	xmlCharEncodingHandlerPtr
				800	xmlGetCharEncodingHandler(xmlCharEncoding enc) {
				801	if (handlers == NULL) xmlInitCharEncodingHandlers();
Daniel Veillard	cf46199	2000-03-14 18:30:20 +0000	[diff] [blame]	802	switch (enc) {
				803	case XML_CHAR_ENCODING_ERROR:
				804	return(NULL);
				805	case XML_CHAR_ENCODING_NONE:
				806	return(NULL);
				807	case XML_CHAR_ENCODING_UTF8:
				808	return(NULL);
				809	case XML_CHAR_ENCODING_UTF16LE:
				810	return(xmlUTF16LEHandler);
				811	case XML_CHAR_ENCODING_UTF16BE:
				812	return(xmlUTF16BEHandler);
				813	case XML_CHAR_ENCODING_EBCDIC:
				814	return(NULL);
				815	case XML_CHAR_ENCODING_UCS4LE:
				816	return(NULL);
				817	case XML_CHAR_ENCODING_UCS4BE:
				818	return(NULL);
				819	case XML_CHAR_ENCODING_UCS4_2143:
				820	return(NULL);
				821	case XML_CHAR_ENCODING_UCS4_3412:
				822	return(NULL);
				823	case XML_CHAR_ENCODING_UCS2:
				824	return(NULL);
				825	case XML_CHAR_ENCODING_8859_1:
				826	return(NULL);
				827	case XML_CHAR_ENCODING_8859_2:
				828	return(NULL);
				829	case XML_CHAR_ENCODING_8859_3:
				830	return(NULL);
				831	case XML_CHAR_ENCODING_8859_4:
				832	return(NULL);
				833	case XML_CHAR_ENCODING_8859_5:
				834	return(NULL);
				835	case XML_CHAR_ENCODING_8859_6:
				836	return(NULL);
				837	case XML_CHAR_ENCODING_8859_7:
				838	return(NULL);
				839	case XML_CHAR_ENCODING_8859_8:
				840	return(NULL);
				841	case XML_CHAR_ENCODING_8859_9:
				842	return(NULL);
				843	case XML_CHAR_ENCODING_2022_JP:
				844	case XML_CHAR_ENCODING_SHIFT_JIS:
				845	case XML_CHAR_ENCODING_EUC_JP:
				846	return(NULL);
				847	}
Daniel Veillard	14fff06	1999-06-22 21:49:07 +0000	[diff] [blame]	848	return(NULL);
				849	}
				850
				851	/**
				852	* xmlGetCharEncodingHandler:
				853	* @enc: a string describing the char encoding.
				854	*
				855	* Search in the registrered set the handler able to read/write that encoding.
				856	*
				857	* Returns the handler or NULL if not found
				858	*/
				859	xmlCharEncodingHandlerPtr
				860	xmlFindCharEncodingHandler(const char *name) {
				861	char upper[500];
				862	int i;
				863
				864	if (handlers == NULL) xmlInitCharEncodingHandlers();
				865	if (name == NULL) return(xmlDefaultCharEncodingHandler);
				866	if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
				867
				868	for (i = 0;i < 499;i++) {
				869	upper[i] = toupper(name[i]);
				870	if (upper[i] == 0) break;
				871	}
				872	upper[i] = 0;
				873
				874	for (i = 0;i < nbCharEncodingHandler; i++)
				875	if (!strcmp(name, handlers[i]->name))
				876	return(handlers[i]);
				877
				878	return(NULL);
				879	}
				880