Blame - jdk/src/share/instrument/EncodingSupport.c - platform/libcore

blob: 63b7bbee263ed57a59d85f45b12abd0d16f8cde6 [file] [log] [blame]

J. Duke	319a3b9	2007-12-01 00:00:00 +0000	[diff] [blame^]	1	/*
				2	* Copyright 2004 Sun Microsystems, Inc. All Rights Reserved.
				3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				4	*
				5	* This code is free software; you can redistribute it and/or modify it
				6	* under the terms of the GNU General Public License version 2 only, as
				7	* published by the Free Software Foundation. Sun designates this
				8	* particular file as subject to the "Classpath" exception as provided
				9	* by Sun in the LICENSE file that accompanied this code.
				10	*
				11	* This code is distributed in the hope that it will be useful, but WITHOUT
				12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				14	* version 2 for more details (a copy is included in the LICENSE file that
				15	* accompanied this code).
				16	*
				17	* You should have received a copy of the GNU General Public License version
				18	* 2 along with this work; if not, write to the Free Software Foundation,
				19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
				20	*
				21	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
				22	* CA 95054 USA or visit www.sun.com if you need additional information or
				23	* have any questions.
				24	*/
				25
				26
				27	/**
				28	* Determine length of this Standard UTF-8 in Modified UTF-8.
				29	* Validation is done of the basic UTF encoding rules, returns
				30	* length (no change) when errors are detected in the UTF encoding.
				31	*
				32	* Note: Accepts Modified UTF-8 also, no verification on the
				33	* correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
				34	*/
				35	int
				36	modifiedUtf8LengthOfUtf8(char* string, int length) {
				37	int new_length;
				38	int i;
				39
				40	new_length = 0;
				41	for ( i = 0 ; i < length ; i++ ) {
				42	unsigned byte;
				43
				44	byte = (unsigned char)string[i];
				45	if ( (byte & 0x80) == 0 ) { /* 1byte encoding */
				46	new_length++;
				47	if ( byte == 0 ) {
				48	new_length++; /* We gain one byte in length on NULL bytes */
				49	}
				50	} else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */
				51	/* Check encoding of following bytes */
				52	if ( (i+1) >= length \|\| (string[i+1] & 0xC0) != 0x80 ) {
				53	break; /* Error condition */
				54	}
				55	i++; /* Skip next byte */
				56	new_length += 2;
				57	} else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */
				58	/* Check encoding of following bytes */
				59	if ( (i+2) >= length \|\| (string[i+1] & 0xC0) != 0x80
				60	\|\| (string[i+2] & 0xC0) != 0x80 ) {
				61	break; /* Error condition */
				62	}
				63	i += 2; /* Skip next two bytes */
				64	new_length += 3;
				65	} else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */
				66	/* Check encoding of following bytes */
				67	if ( (i+3) >= length \|\| (string[i+1] & 0xC0) != 0x80
				68	\|\| (string[i+2] & 0xC0) != 0x80
				69	\|\| (string[i+3] & 0xC0) != 0x80 ) {
				70	break; /* Error condition */
				71	}
				72	i += 3; /* Skip next 3 bytes */
				73	new_length += 6; /* 4byte encoding turns into 2 3byte ones */
				74	} else {
				75	break; /* Error condition */
				76	}
				77	}
				78	if ( i != length ) {
				79	/* Error in finding new length, return old length so no conversion */
				80	/* FIXUP: ERROR_MESSAGE? */
				81	return length;
				82	}
				83	return new_length;
				84	}
				85
				86	/*
				87	* Convert Standard UTF-8 to Modified UTF-8.
				88	* Assumes the UTF-8 encoding was validated by modifiedLength() above.
				89	*
				90	* Note: Accepts Modified UTF-8 also, no verification on the
				91	* correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
				92	*/
				93	void
				94	convertUtf8ToModifiedUtf8(char string, int length, char new_string, int new_length)
				95	{
				96	int i;
				97	int j;
				98
				99	j = 0;
				100	for ( i = 0 ; i < length ; i++ ) {
				101	unsigned byte1;
				102
				103	byte1 = (unsigned char)string[i];
				104
				105	/* NULL bytes and bytes starting with 11110xxx are special */
				106	if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
				107	if ( byte1 == 0 ) {
				108	/* Bits out: 11000000 10000000 */
				109	new_string[j++] = (char)0xC0;
				110	new_string[j++] = (char)0x80;
				111	} else {
				112	/* Single byte */
				113	new_string[j++] = byte1;
				114	}
				115	} else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
				116	new_string[j++] = byte1;
				117	new_string[j++] = string[++i];
				118	} else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
				119	new_string[j++] = byte1;
				120	new_string[j++] = string[++i];
				121	new_string[j++] = string[++i];
				122	} else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */
				123	/* Beginning of 4byte encoding, turn into 2 3byte encodings */
				124	unsigned byte2, byte3, byte4, u21;
				125
				126	/* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
				127	byte2 = (unsigned char)string[++i];
				128	byte3 = (unsigned char)string[++i];
				129	byte4 = (unsigned char)string[++i];
				130	/* Reconstruct full 21bit value */
				131	u21 = (byte1 & 0x07) << 18;
				132	u21 += (byte2 & 0x3F) << 12;
				133	u21 += (byte3 & 0x3F) << 6;
				134	u21 += (byte4 & 0x3F);
				135	/* Bits out: 11101101 1010xxxx 10xxxxxx */
				136	new_string[j++] = (char)0xED;
				137	new_string[j++] = 0xA0 + (((u21 >> 16) - 1) & 0x0F);
				138	new_string[j++] = 0x80 + ((u21 >> 10) & 0x3F);
				139	/* Bits out: 11101101 1011xxxx 10xxxxxx */
				140	new_string[j++] = (char)0xED;
				141	new_string[j++] = 0xB0 + ((u21 >> 6) & 0x0F);
				142	new_string[j++] = byte4;
				143	}
				144	}
				145	new_string[j] = 0;
				146	}