Blame - encoding.c - fp2-dev/platform/external/libxml2

blob: a7e984e050d1d5e67e7b2e4ebdde1c8a7727fb1e [file] [log] [blame]

Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	1	/*
				2	* encoding.c : implements the encoding conversion functions needed for XML
				3	*
				4	* Related specs:
				5	* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
				6	* [ISO-10646] UTF-8 and UTF-16 in Annexes
				7	* [ISO-8859-1] ISO Latin-1 characters codes.
				8	* [UNICODE] The Unicode Consortium, "The Unicode Standard --
				9	* Worldwide Character Encoding -- Version 1.0", Addison-
				10	* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
				11	* described in Unicode Technical Report #4.
				12	* [US-ASCII] Coded Character Set--7-bit American Standard Code for
				13	* Information Interchange, ANSI X3.4-1986.
				14	*
				15	* Original code from "Martin J. Duerst" <duerst@w3.org>
				16	*
				17	* See Copyright for the status of this software.
				18	*
Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame]	19	* Daniel.Veillard@w3.org
				20	*/
				21
#include <ctype.h>
#include <string.h>
#include "encoding.h"
				24
Daniel Veillard	0ba4d53	1998-11-01 19:34:31 +0000	[diff] [blame]	25	/*
				26	* From rfc2044: encoding of the Unicode values on UTF-8:
				27	*
				28	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
				29	* 0000 0000-0000 007F 0xxxxxxx
				30	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
				31	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
				32	*
				33	* I hope we won't use values > 0xFFFF anytime soon !
				34	*/
				35
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged; bytes in
 * 0x80..0xFF become the two-byte sequence 110000xx 10xxxxxx.
 * A two-byte sequence is only emitted when both bytes fit, so @out never
 * ends with a truncated sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else {
            /* make sure both bytes fit before writing either of them */
            if (outend - out < 2) return -1;
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (0x3F & c));
        }
    }
    return (int) (out - outstart);
}
				70
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only code points up to 0xFF can be represented, so
 * the only accepted multi-byte lead bytes are 0xC2 and 0xC3, and they must
 * be followed by a 10xxxxxx continuation byte.
 * TODO: need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 * if the transcoding failed (input not representable in ISO Latin 1,
 * or malformed/truncated UTF-8).
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else if (((c & 0xFE) == 0xC2) && (in < inend) &&
                 ((*in & 0xC0) == 0x80)) {
            /* 1100001x 10xxxxxx : two low lead bits + six payload bits */
            if (out >= outend) return -1;
            *out++ = (unsigned char) (((c & 0x03) << 6) | (*in++ & 0x3F));
        }
        else return -2;
    }
    return (int) (out - outstart);
}
				106
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in (number of shorts)
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. A high surrogate (0xD800..0xDBFF) must be followed
 * by a low surrogate (0xDC00..0xDFFF); the pair is combined into a single
 * code point >= 0x10000 and emitted as a four-byte sequence.
 * A low surrogate that is not preceded by a high surrogate is not rejected;
 * it is encoded like any other 16-bit value.
 *
 * Returns the number of bytes written, or -1 by lack of space or when a
 * high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned short* inend = in + inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c = *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in < inend) && (((d = *in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Emit the lead byte; bits is the shift of the first continuation
         * byte, or negative when there is none (plain ASCII).
         */
        if (out >= outend) return -1;
        if (c < 0x80) {
            *out++ = (unsigned char) c;
            bits = -6;
        }
        else if (c < 0x800) {
            *out++ = (unsigned char) ((c >> 6) | 0xC0);
            bits = 0;
        }
        else if (c < 0x10000) {
            *out++ = (unsigned char) ((c >> 12) | 0xE0);
            bits = 6;
        }
        else {
            *out++ = (unsigned char) ((c >> 18) | 0xF0);
            bits = 12;
        }

        /* continuation bytes: six payload bits each, tagged 10xxxxxx */
        for ( ; bits >= 0; bits -= 6) {
            if (out >= outend) return -1;
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);
        }
    }
    return (int) (out - outstart);
}
				154
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Code points >= 0x10000 are written as a surrogate
 * pair, which is only emitted when both shorts fit in @out.
 * TODO: need a fallback mechanism ...
 *
 * Returns the number of shorts written, or -1 by lack of space, or -2
 * if the transcoding failed (malformed or truncated UTF-8 sequence, or a
 * code point above 0x10FFFF that UTF-16 cannot represent).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart = out;
    unsigned short* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        /* the lead byte gives the payload bits and the sequence length */
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) return -2;    /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* a missing or non-10xxxxxx continuation byte is bad input */
            if ((in >= inend) || (((d = *in++) & 0xC0) != 0x80)) return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = (unsigned short) c;
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs two free slots */
            if (outend - out < 2) return -1;
            c -= 0x10000;
            *out++ = (unsigned short) (0xD800 | (c >> 10));
            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
        }
        else return -2;
    }
    return (int) (out - outstart);
}
				206
Daniel Veillard	97b5877	1998-10-20 06:14:16 +0000	[diff] [blame]	207
Daniel Veillard	27d8874	1999-05-29 11:51:49 +0000	[diff] [blame^]	208	/**
				209	* xmlDetectCharEncoding:
				210	* @in: a pointer to the first bytes of the XML entity, must be at least
				211	* 4 bytes long.
				212	*
				213	* Guess the encoding of the entity using the first bytes of the entity content
				214	* accordingly of the non-normative appendix F of the XML-1.0 recommendation.
				215	*
				216	* Returns one of the XML_CHAR_ENCODING_... values.
				217	*/
				218	xmlCharEncoding
				219	xmlDetectCharEncoding(unsigned char* in)
				220	{
				221	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				222	(in[2] == 0x00) && (in[3] == 0x3C))
				223	return(XML_CHAR_ENCODING_UCS4BE);
				224	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
				225	(in[2] == 0x00) && (in[3] == 0x00))
				226	return(XML_CHAR_ENCODING_UCS4LE);
				227	if ((in[0] == 0x00) && (in[1] == 0x00) &&
				228	(in[2] == 0x3C) && (in[3] == 0x00))
				229	return(XML_CHAR_ENCODING_UCS4_2143);
				230	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
				231	(in[2] == 0x00) && (in[3] == 0x00))
				232	return(XML_CHAR_ENCODING_UCS4_3412);
				233	if ((in[0] == 0xFE) && (in[1] == 0xFF))
				234	return(XML_CHAR_ENCODING_UTF16BE);
				235	if ((in[0] == 0xFF) && (in[1] == 0xFE))
				236	return(XML_CHAR_ENCODING_UTF16LE);
				237	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
				238	(in[2] == 0xA7) && (in[3] == 0x94))
				239	return(XML_CHAR_ENCODING_EBCDIC);
				240	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
				241	(in[2] == 0x78) && (in[3] == 0x6D))
				242	return(XML_CHAR_ENCODING_UTF8);
				243	return(XML_CHAR_ENCODING_NONE);
				244	}
				245
				246	/**
				247	* xmlParseCharEncoding:
				248	* @name: the encoding name as parsed, in UTF-8 format (ASCCI actually)
				249	*
				250	* Conpare the string to the known encoding schemes already known. Note
				251	* that the comparison is case insensitive accordingly to the section
				252	* [XML] 4.3.3 Character Encoding in Entities.
				253	*
				254	* Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
				255	* if not recognized.
				256	*/
				257	xmlCharEncoding
				258	xmlParseCharEncoding(char* name)
				259	{
				260	char upper[500];
				261	int i;
				262
				263	for (i = 0;i < 499;i++) {
				264	upper[i] = toupper(name[i]);
				265	if (upper[i] == 0) break;
				266	}
				267	upper[i] = 0;
				268
				269	if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
				270	if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
				271	if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
				272
				273	/*
				274	* NOTE: if we were able to parse this, the endianness of UTF16 is
				275	* already found and in use
				276	*/
				277	if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
				278	if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
				279
				280	if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				281	if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
				282	if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
				283
				284	/*
				285	* NOTE: if we were able to parse this, the endianness of UCS4 is
				286	* already found and in use
				287	*/
				288	if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				289	if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
				290	if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
				291
				292
				293	if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
				294	if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
				295	if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
				296
				297	if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
				298	if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
				299	if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
				300
				301	if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
				302	if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
				303	if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
				304	if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
				305	if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
				306	if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
				307	if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
				308
				309	if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
				310	if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
				311	if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
				312	return(XML_CHAR_ENCODING_ERROR);
				313	}