Blame - encoding.c - fp2-dev/platform/external/libxml2

blob: 033e45e27c410b1e2f2606992f598ded96efddf4 [file] [log] [blame]

Daniel Veillard	891e404	1998-10-19 00:43:02 +0000	[diff] [blame^]	1	/*
				2	* encoding.c : implements the encoding conversion functions needed for XML
				3	*
				4	* Related specs:
				5	* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
				6	* [ISO-10646] UTF-8 and UTF-16 in Annexes
				7	* [ISO-8859-1] ISO Latin-1 characters codes.
				8	* [UNICODE] The Unicode Consortium, "The Unicode Standard --
				9	* Worldwide Character Encoding -- Version 1.0", Addison-
				10	* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
				11	* described in Unicode Technical Report #4.
				12	* [US-ASCII] Coded Character Set--7-bit American Standard Code for
				13	* Information Interchange, ANSI X3.4-1986.
				14	*
				15	* Original code from "Martin J. Duerst" <duerst@w3.org>
				16	*
				17	* See Copyright for the status of this software.
				18	*
				19	* $Id$
				20	*
				21	* Daniel.Veillard@w3.org
				22	*/
				23
				24	#include "encoding.h"
				25
				/*
				 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
				 * block of chars out.
				 *
				 * out/outlen: destination buffer and its capacity in bytes.
				 * in/inlen:   source buffer and the number of Latin 1 bytes to convert.
				 *
				 * Bytes below 0x80 are copied unchanged; every other byte is expanded to
				 * a two-byte sequence (lead byte 0xC2 or 0xC3 followed by one
				 * continuation byte).  Room for a complete sequence is checked before any
				 * part of it is stored, so a failed call never leaves half a character at
				 * the end of the output.
				 *
				 * Returns the number of bytes written, or -1 by lack of space.
				 */
				int isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
				{
				    unsigned char* outstart = out;
				    unsigned char* outend = out + outlen;
				    unsigned char* inend = in + inlen;
				    unsigned char c;

				    while (in < inend) {
				        c = *in++;
				        if (c < 0x80) {
				            /* plain ASCII: one byte in, one byte out */
				            if (out >= outend) return -1;
				            *out++ = c;
				        }
				        else {
				            /* needs two output bytes; refuse up front if both do not fit */
				            if (outend - out < 2) return -1;
				            *out++ = (unsigned char) (0xC0 | (c >> 6));
				            *out++ = (unsigned char) (0x80 | (0x3F & c));
				        }
				    }
				    return (int) (out - outstart);
				}
				54
				55
				/*
				 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
				 * block of chars out.
				 *
				 * out/outlen: destination buffer and its capacity in bytes.
				 * in/inlen:   source buffer and the number of UTF-8 bytes to convert.
				 *
				 * Only single-byte sequences (below 0x80) and two-byte sequences led by
				 * 0xC2 or 0xC3 can be represented in ISO Latin 1; anything else, a
				 * sequence cut short by the end of the input, or a second byte that is
				 * not of the form 10xxxxxx makes the conversion fail.
				 *
				 * Returns the number of bytes written, or -1 by lack of space, or -2
				 * if the transcoding failed.
				 *
				 * TODO: need a fallback mechanism ...
				 */
				int UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
				{
				    unsigned char* outstart = out;
				    unsigned char* outend = out + outlen;
				    unsigned char* inend = in + inlen;
				    unsigned char c, d;

				    while (in < inend) {
				        c = *in++;
				        if (c < 0x80) {
				            if (out >= outend) return -1;
				            *out++ = c;
				        }
				        else if (((c & 0xFE) == 0xC2) && (in < inend)) {
				            /* peek at the second byte; it must be a continuation byte */
				            d = *in;
				            if ((d & 0xC0) != 0x80) return -2;
				            if (out >= outend) return -1;
				            in++;
				            /* two low bits of the lead byte + six payload bits */
				            *out++ = (unsigned char) (((c & 0x03) << 6) | (d & 0x3F));
				        }
				        else return -2;
				    }
				    return (int) (out - outstart);
				}
				86
				/*
				 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
				 * block of chars out.
				 *
				 * out/outlen: destination buffer and its capacity in bytes.
				 * in/inlen:   source buffer and the number of 16-bit units to convert.
				 *
				 * A high surrogate (0xD800-0xDBFF) must be followed by a low surrogate
				 * (0xDC00-0xDFFF); the pair is combined into one value above 0xFFFF.
				 * A high surrogate without its partner also yields -1.  A low surrogate
				 * that is not preceded by a high one is not detected and is encoded like
				 * any other 16-bit value.
				 *
				 * Room for a complete sequence is checked before any part of it is
				 * stored, so a failed call never leaves a partial character in the output.
				 *
				 * Returns the number of bytes written, or -1 by lack of space.
				 */
				int UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
				{
				    unsigned char* outstart = out;
				    unsigned char* outend = out + outlen;
				    unsigned short* inend = in + inlen;
				    unsigned int c, d;
				    int bits;
				    int needed;

				    while (in < inend) {
				        c = *in++;
				        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
				            if ((in < inend) && (((d = *in++) & 0xFC00) == 0xDC00)) {
				                c &= 0x03FF;
				                c <<= 10;
				                c |= d & 0x03FF;
				                c += 0x10000;
				            }
				            else return -1;
				        }

				        /* assertion: c is a single UCS-4 value */

				        /* sequence length, and shift of the first continuation byte */
				        if      (c <    0x80) { needed = 1; bits = -6; }
				        else if (c <   0x800) { needed = 2; bits =  0; }
				        else if (c < 0x10000) { needed = 3; bits =  6; }
				        else                  { needed = 4; bits = 12; }

				        if (outend - out < needed) return -1;

				        /* lead byte */
				        if      (needed == 1) *out++ = (unsigned char) c;
				        else if (needed == 2) *out++ = (unsigned char) ((c >>  6) | 0xC0);
				        else if (needed == 3) *out++ = (unsigned char) ((c >> 12) | 0xE0);
				        else                  *out++ = (unsigned char) ((c >> 18) | 0xF0);

				        /* continuation bytes: six payload bits each, tagged 10xxxxxx */
				        for ( ; bits >= 0; bits -= 6) {
				            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);
				        }
				    }
				    return (int) (out - outstart);
				}
				128
				/*
				 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
				 * block of chars out.
				 *
				 * out/outlen: destination buffer and its capacity in 16-bit units.
				 * in/inlen:   source buffer and the number of UTF-8 bytes to convert.
				 *
				 * Sequences of one to four bytes are decoded; values above 0xFFFF are
				 * written as a surrogate pair.  A continuation byte in lead position, a
				 * lead byte of 0xF8 or more, a sequence cut short by the end of the
				 * input, a trailing byte that is not of the form 10xxxxxx, or a decoded
				 * value of 0x110000 or more all count as a transcoding failure.
				 * Overlong forms and encoded surrogate values are not rejected.
				 *
				 * Returns the number of 16-bit units written, or -1 by lack of space,
				 * or -2 if the transcoding failed.
				 *
				 * TODO: need a fallback mechanism ...
				 */
				int UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
				{
				    unsigned short* outstart = out;
				    unsigned short* outend = out + outlen;
				    unsigned char* inend = in + inlen;
				    unsigned int c, d, trailing;

				    while (in < inend) {
				        d = *in++;
				        if      (d < 0x80) { c = d; trailing = 0; }
				        else if (d < 0xC0) return -2;    /* trailing byte in leading position */
				        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
				        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
				        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
				        else return -2;    /* no chance for this in UTF-16 */

				        for ( ; trailing; trailing--) {
				            /* truncated or malformed input is a transcoding failure */
				            if ((in >= inend) || (((d = *in++) & 0xC0) != 0x80)) return -2;
				            c <<= 6;
				            c |= d & 0x3F;
				        }

				        /* assertion: c is a single UCS-4 value */
				        if (c < 0x10000) {
				            if (out >= outend) return -1;
				            *out++ = (unsigned short) c;
				        }
				        else if (c < 0x110000) {
				            /* a surrogate pair needs two free output units */
				            if (outend - out < 2) return -1;
				            c -= 0x10000;
				            *out++ = (unsigned short) (0xD800 | (c >> 10));
				            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
				        }
				        else return -2;    /* beyond the range UTF-16 can represent */
				    }
				    return (int) (out - outstart);
				}
				175