Blame - jdk/src/share/npt/utf.c - platform/libcore

blob: 43a6fa0aaf34769bce70a30b077a273948833688 [file] [log] [blame]

J. Duke	319a3b9	2007-12-01 00:00:00 +0000	[diff] [blame^]	1	/*
				2	* Copyright 2004-2005 Sun Microsystems, Inc. All Rights Reserved.
				3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				4	*
				5	* This code is free software; you can redistribute it and/or modify it
				6	* under the terms of the GNU General Public License version 2 only, as
				7	* published by the Free Software Foundation. Sun designates this
				8	* particular file as subject to the "Classpath" exception as provided
				9	* by Sun in the LICENSE file that accompanied this code.
				10	*
				11	* This code is distributed in the hope that it will be useful, but WITHOUT
				12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				14	* version 2 for more details (a copy is included in the LICENSE file that
				15	* accompanied this code).
				16	*
				17	* You should have received a copy of the GNU General Public License version
				18	* 2 along with this work; if not, write to the Free Software Foundation,
				19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
				20	*
				21	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
				22	* CA 95054 USA or visit www.sun.com if you need additional information or
				23	* have any questions.
				24	*/
				25
				26	/* Misc functions for conversion of Unicode and UTF-8 and platform encoding */
				27
				28	#include <stdio.h>
				29	#include <stddef.h>
				30	#include <stdlib.h>
				31	#include <stdarg.h>
				32	#include <string.h>
				33	#include <ctype.h>
				34
				35	#include "jni.h"
				36
				37	#include "utf.h"
				38
				/*
				 * Error handler.
				 *
				 * Writes one line of the form
				 *     UTF ERROR ["file":line]: message
				 * to stderr and then calls abort(), so control never returns to the
				 * caller.
				 *
				 *   file     NUL-terminated name of the source file reporting the error
				 *   line     line number within that file
				 *   message  NUL-terminated description of the failure
				 */
				void
				utfError(char *file, int line, char *message)
				{
				    /* Both file and message are formatted with %s, so they must be
				     * pointers to strings, not single characters.
				     */
				    (void)fprintf(stderr, "UTF ERROR [\"%s\":%d]: %s\n", file, line, message);
				    abort();
				}
				48
				49	/*
				50	* Convert UTF-8 to UTF-16
				51	* Returns length or -1 if output overflows.
				52	*/
				53	int JNICALL
				54	utf8ToUtf16(struct UtfInst ui, jbyte utf8, int len, unsigned short *output, int outputMaxLen)
				55	{
				56	int outputLen;
				57	int i;
				58
				59	UTF_ASSERT(utf8);
				60	UTF_ASSERT(len>=0);
				61	UTF_ASSERT(output);
				62	UTF_ASSERT(outputMaxLen>0);
				63
				64	i = 0;
				65	outputLen = 0;
				66	while ( i<len ) {
				67	unsigned code, x, y, z;
				68
				69	if ( outputLen >= outputMaxLen ) {
				70	return -1;
				71	}
				72	x = (unsigned char)utf8[i++];
				73	code = x;
				74	if ( (x & 0xE0)==0xE0 ) {
				75	y = (unsigned char)utf8[i++];
				76	z = (unsigned char)utf8[i++];
				77	code = ((x & 0xF)<<12) + ((y & 0x3F)<<6) + (z & 0x3F);
				78	} else if ( (x & 0xC0)==0xC0 ) {
				79	y = (unsigned char)utf8[i++];
				80	code = ((x & 0x1F)<<6) + (y & 0x3F);
				81	}
				82	output[outputLen++] = code;
				83	}
				84	return outputLen;
				85	}
				86
				87	/*
				88	* Convert UTF-16 to UTF-8 Modified
				89	* Returns length or -1 if output overflows.
				90	*/
				91	int JNICALL
				92	utf16ToUtf8m(struct UtfInst ui, unsigned short utf16, int len, jbyte *output, int outputMaxLen)
				93	{
				94	int i;
				95	int outputLen;
				96
				97	UTF_ASSERT(utf16);
				98	UTF_ASSERT(len>=0);
				99	UTF_ASSERT(output);
				100	UTF_ASSERT(outputMaxLen>0);
				101
				102	outputLen = 0;
				103	for (i = 0; i < len; i++) {
				104	unsigned code;
				105
				106	code = utf16[i];
				107	if ( code >= 0x0001 && code <= 0x007F ) {
				108	output[outputLen++] = code;
				109	} else if ( code == 0 \|\| ( code >= 0x0080 && code <= 0x07FF ) ) {
				110	output[outputLen++] = ((code>>6) & 0x1F) \| 0xC0;
				111	output[outputLen++] = (code & 0x3F) \| 0x80;
				112	} else if ( code >= 0x0800 && code <= 0xFFFF ) {
				113	output[outputLen++] = ((code>>12) & 0x0F) \| 0xE0;
				114	output[outputLen++] = ((code>>6) & 0x3F) \| 0x80;
				115	output[outputLen++] = (code & 0x3F) \| 0x80;
				116	}
				117	if ( outputLen > outputMaxLen ) {
				118	return -1;
				119	}
				120	}
				121	output[outputLen] = 0;
				122	return outputLen;
				123	}
				124
				125	int JNICALL
				126	utf16ToUtf8s(struct UtfInst ui, unsigned short utf16, int len, jbyte *output, int outputMaxLen)
				127	{
				128	return -1; /* FIXUP */
				129	}
				130
				131	/* Determine length of this Standard UTF-8 in Modified UTF-8.
				132	* Validation is done of the basic UTF encoding rules, returns
				133	* length (no change) when errors are detected in the UTF encoding.
				134	*
				135	* Note: Accepts Modified UTF-8 also, no verification on the
				136	* correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
				137	*/
				138	int JNICALL
				139	utf8sToUtf8mLength(struct UtfInst ui, jbyte string, int length)
				140	{
				141	int newLength;
				142	int i;
				143
				144	newLength = 0;
				145	for ( i = 0 ; i < length ; i++ ) {
				146	unsigned byte;
				147
				148	byte = (unsigned char)string[i];
				149	if ( (byte & 0x80) == 0 ) { /* 1byte encoding */
				150	newLength++;
				151	if ( byte == 0 ) {
				152	newLength++; /* We gain one byte in length on NULL bytes */
				153	}
				154	} else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */
				155	/* Check encoding of following bytes */
				156	if ( (i+1) >= length \|\| (string[i+1] & 0xC0) != 0x80 ) {
				157	break; /* Error condition */
				158	}
				159	i++; /* Skip next byte */
				160	newLength += 2;
				161	} else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */
				162	/* Check encoding of following bytes */
				163	if ( (i+2) >= length \|\| (string[i+1] & 0xC0) != 0x80
				164	\|\| (string[i+2] & 0xC0) != 0x80 ) {
				165	break; /* Error condition */
				166	}
				167	i += 2; /* Skip next two bytes */
				168	newLength += 3;
				169	} else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */
				170	/* Check encoding of following bytes */
				171	if ( (i+3) >= length \|\| (string[i+1] & 0xC0) != 0x80
				172	\|\| (string[i+2] & 0xC0) != 0x80
				173	\|\| (string[i+3] & 0xC0) != 0x80 ) {
				174	break; /* Error condition */
				175	}
				176	i += 3; /* Skip next 3 bytes */
				177	newLength += 6; /* 4byte encoding turns into 2 3byte ones */
				178	} else {
				179	break; /* Error condition */
				180	}
				181	}
				182	if ( i != length ) {
				183	/* Error in finding new length, return old length so no conversion */
				184	/* FIXUP: ERROR_MESSAGE? */
				185	return length;
				186	}
				187	return newLength;
				188	}
				189
				190	/* Convert Standard UTF-8 to Modified UTF-8.
				191	* Assumes the UTF-8 encoding was validated by utf8mLength() above.
				192	*
				193	* Note: Accepts Modified UTF-8 also, no verification on the
				194	* correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
				195	*/
				196	void JNICALL
				197	utf8sToUtf8m(struct UtfInst ui, jbyte string, int length, jbyte *newString, int newLength)
				198	{
				199	int i;
				200	int j;
				201
				202	j = 0;
				203	for ( i = 0 ; i < length ; i++ ) {
				204	unsigned byte1;
				205
				206	byte1 = (unsigned char)string[i];
				207
				208	/* NULL bytes and bytes starting with 11110xxx are special */
				209	if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
				210	if ( byte1 == 0 ) {
				211	/* Bits out: 11000000 10000000 */
				212	newString[j++] = (jbyte)0xC0;
				213	newString[j++] = (jbyte)0x80;
				214	} else {
				215	/* Single byte */
				216	newString[j++] = byte1;
				217	}
				218	} else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
				219	newString[j++] = byte1;
				220	newString[j++] = string[++i];
				221	} else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
				222	newString[j++] = byte1;
				223	newString[j++] = string[++i];
				224	newString[j++] = string[++i];
				225	} else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */
				226	/* Beginning of 4byte encoding, turn into 2 3byte encodings */
				227	unsigned byte2, byte3, byte4, u21;
				228
				229	/* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
				230	byte2 = (unsigned char)string[++i];
				231	byte3 = (unsigned char)string[++i];
				232	byte4 = (unsigned char)string[++i];
				233	/* Reconstruct full 21bit value */
				234	u21 = (byte1 & 0x07) << 18;
				235	u21 += (byte2 & 0x3F) << 12;
				236	u21 += (byte3 & 0x3F) << 6;
				237	u21 += (byte4 & 0x3F);
				238	/* Bits out: 11101101 1010xxxx 10xxxxxx */
				239	newString[j++] = (jbyte)0xED;
				240	newString[j++] = (jbyte)(0xA0 + (((u21 >> 16) - 1) & 0x0F));
				241	newString[j++] = (jbyte)(0x80 + ((u21 >> 10) & 0x3F));
				242	/* Bits out: 11101101 1011xxxx 10xxxxxx */
				243	newString[j++] = (jbyte)0xED;
				244	newString[j++] = (jbyte)(0xB0 + ((u21 >> 6) & 0x0F));
				245	newString[j++] = byte4;
				246	}
				247	}
				248	UTF_ASSERT(i==length);
				249	UTF_ASSERT(j==newLength);
				250	newString[j] = (jbyte)0;
				251	}
				252
				253	/* Given a Modified UTF-8 string, calculate the Standard UTF-8 length.
				254	* Basic validation of the UTF encoding rules is done, and length is
				255	* returned (no change) when errors are detected.
				256	*
				257	* Note: No validation is made that this is indeed Modified UTF-8 coming in.
				258	*
				259	*/
				260	int JNICALL
				261	utf8mToUtf8sLength(struct UtfInst ui, jbyte string, int length)
				262	{
				263	int newLength;
				264	int i;
				265
				266	newLength = 0;
				267	for ( i = 0 ; i < length ; i++ ) {
				268	unsigned byte1, byte2, byte3, byte4, byte5, byte6;
				269
				270	byte1 = (unsigned char)string[i];
				271	if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
				272	newLength++;
				273	} else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
				274	/* Check encoding of following bytes */
				275	if ( (i+1) >= length \|\| (string[i+1] & 0xC0) != 0x80 ) {
				276	break; /* Error condition */
				277	}
				278	byte2 = (unsigned char)string[++i];
				279	if ( byte1 != 0xC0 \|\| byte2 != 0x80 ) {
				280	newLength += 2; /* Normal 2byte encoding, not 0xC080 */
				281	} else {
				282	newLength++; /* We will turn 0xC080 into 0 */
				283	}
				284	} else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
				285	/* Check encoding of following bytes */
				286	if ( (i+2) >= length \|\| (string[i+1] & 0xC0) != 0x80
				287	\|\| (string[i+2] & 0xC0) != 0x80 ) {
				288	break; /* Error condition */
				289	}
				290	byte2 = (unsigned char)string[++i];
				291	byte3 = (unsigned char)string[++i];
				292	newLength += 3;
				293	/* Possible process a second 3byte encoding */
				294	if ( (i+3) < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {
				295	/* See if this is a pair of 3byte encodings */
				296	byte4 = (unsigned char)string[i+1];
				297	byte5 = (unsigned char)string[i+2];
				298	byte6 = (unsigned char)string[i+3];
				299	if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {
				300	/* Check encoding of 3rd byte */
				301	if ( (byte6 & 0xC0) != 0x80 ) {
				302	break; /* Error condition */
				303	}
				304	newLength++; /* New string will have 4byte encoding */
				305	i += 3; /* Skip next 3 bytes */
				306	}
				307	}
				308	} else {
				309	break; /* Error condition */
				310	}
				311	}
				312	if ( i != length ) {
				313	/* Error in UTF encoding */
				314	/* FIXUP: ERROR_MESSAGE()? */
				315	return length;
				316	}
				317	return newLength;
				318	}
				319
				320	/* Convert a Modified UTF-8 string into a Standard UTF-8 string
				321	* It is assumed that this string has been validated in terms of the
				322	* basic UTF encoding rules by utf8Length() above.
				323	*
				324	* Note: No validation is made that this is indeed Modified UTF-8 coming in.
				325	*
				326	*/
				327	void JNICALL
				328	utf8mToUtf8s(struct UtfInst ui, jbyte string, int length, jbyte *newString, int newLength)
				329	{
				330	int i;
				331	int j;
				332
				333	j = 0;
				334	for ( i = 0 ; i < length ; i++ ) {
				335	unsigned byte1, byte2, byte3, byte4, byte5, byte6;
				336
				337	byte1 = (unsigned char)string[i];
				338	if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
				339	/* Single byte */
				340	newString[j++] = byte1;
				341	} else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
				342	byte2 = (unsigned char)string[++i];
				343	if ( byte1 != 0xC0 \|\| byte2 != 0x80 ) {
				344	newString[j++] = byte1;
				345	newString[j++] = byte2;
				346	} else {
				347	newString[j++] = 0;
				348	}
				349	} else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
				350	byte2 = (unsigned char)string[++i];
				351	byte3 = (unsigned char)string[++i];
				352	if ( i+3 < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {
				353	/* See if this is a pair of 3byte encodings */
				354	byte4 = (unsigned char)string[i+1];
				355	byte5 = (unsigned char)string[i+2];
				356	byte6 = (unsigned char)string[i+3];
				357	if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {
				358	unsigned u21;
				359
				360	/* Bits in: 11101101 1010xxxx 10xxxxxx */
				361	/* Bits in: 11101101 1011xxxx 10xxxxxx */
				362	i += 3;
				363
				364	/* Reconstruct 21 bit code */
				365	u21 = ((byte2 & 0x0F) + 1) << 16;
				366	u21 += (byte3 & 0x3F) << 10;
				367	u21 += (byte5 & 0x0F) << 6;
				368	u21 += (byte6 & 0x3F);
				369
				370	/* Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
				371
				372	/* Convert to 4byte encoding */
				373	newString[j++] = 0xF0 + ((u21 >> 18) & 0x07);
				374	newString[j++] = 0x80 + ((u21 >> 12) & 0x3F);
				375	newString[j++] = 0x80 + ((u21 >> 6) & 0x3F);
				376	newString[j++] = 0x80 + (u21 & 0x3F);
				377	continue;
				378	}
				379	}
				380	/* Normal 3byte encoding */
				381	newString[j++] = byte1;
				382	newString[j++] = byte2;
				383	newString[j++] = byte3;
				384	}
				385	}
				386	UTF_ASSERT(i==length);
				387	UTF_ASSERT(j==newLength);
				388	newString[j] = 0;
				389	}
				390
				391	/* ================================================================= */
				392
				393	#if 1 /* Test program */
				394
				/*
				 * Convert any byte array into a printable string.
				 *
				 * Bytes in 0x00..0x7f that are printable and not control characters are
				 * copied as is; every other byte is written as the four characters
				 * \xNN (two lowercase hex digits). The result is NUL terminated.
				 *
				 *   ui            not referenced by this function
				 *   bytes         input bytes (asserted non-NULL)
				 *   len           number of input bytes (asserted >= 0)
				 *   output        receives the printable text and its terminator
				 *   outputMaxLen  capacity of output in bytes, terminator included
				 *
				 * Returns the length of the text, not counting the terminator, or -1 if
				 * the text plus its terminator does not fit in outputMaxLen bytes.
				 */
				static int
				bytesToPrintable(struct UtfInst *ui, char *bytes, int len, char *output, int outputMaxLen)
				{
				    int outputLen;
				    int i;

				    UTF_ASSERT(bytes);
				    UTF_ASSERT(len>=0);
				    UTF_ASSERT(output);
				    UTF_ASSERT(outputMaxLen>=0);

				    outputLen = 0;
				    for ( i=0; i<len ; i++ ) {
				        unsigned byte;
				        int      need;

				        /* Go through unsigned char so a negative char cannot widen to a
				         * value above 0xFF, which would make %02x print more than two
				         * digits.
				         */
				        byte = (unsigned char)bytes[i];
				        if ( byte <= 0x7f && isprint(byte) && !iscntrl(byte) ) {
				            need = 1;
				        } else {
				            need = 4; /* backslash, 'x' and two hex digits */
				        }

				        /* Check BEFORE writing, keeping one byte back for the NUL */
				        if ( need >= (outputMaxLen - outputLen) ) {
				            return -1;
				        }

				        if ( need == 1 ) {
				            output[outputLen++] = (char)byte;
				        } else {
				            /* Exactly 4 characters plus a NUL, all within the room
				             * verified above.
				             */
				            (void)sprintf(output+outputLen,"\\x%02x",byte);
				            outputLen += 4;
				        }
				    }
				    /* Only reachable with outputLen >= outputMaxLen when the buffer has
				     * no room at all (outputMaxLen of 0 and no input).
				     */
				    if ( outputLen >= outputMaxLen ) {
				        return -1;
				    }
				    output[outputLen] = 0;
				    return outputLen;
				}
				428
				429	static void
				430	test(void)
				431	{
				432	static char *strings[] = {
				433	"characters",
				434	"abcdefghijklmnopqrstuvwxyz",
				435	"0123456789",
				436	"!@#$%^&*()_+=-{}[]:;",
				437	NULL };
				438	int i;
				439	struct UtfInst *ui;
				440
				441	ui = utfInitialize(NULL);
				442
				443	i = 0;
				444	while ( strings[i] != NULL ) {
				445	char *str;
				446	#define MAX 1024
				447	char buf0[MAX];
				448	char buf1[MAX];
				449	char buf2[MAX];
				450	unsigned short buf3[MAX];
				451	int len1;
				452	int len2;
				453	int len3;
				454
				455	str = strings[i];
				456
				457	(void)bytesToPrintable(ui, str, (int)strlen(str), buf0, 1024);
				458
				459	len1 = utf8FromPlatform(ui, str, (int)strlen(str), (jbyte*)buf1, 1024);
				460
				461	UTF_ASSERT(len1==(int)strlen(str));
				462
				463	len3 = utf8ToUtf16(ui, (jbyte)buf1, len1, (jchar)buf3, 1024);
				464
				465	UTF_ASSERT(len3==len1);
				466
				467	len1 = utf16ToUtf8m(ui, (jchar)buf3, len3, (jbyte)buf1, 1024);
				468
				469	UTF_ASSERT(len1==len3);
				470	UTF_ASSERT(strcmp(str, buf1) == 0);
				471
				472	len2 = utf8ToPlatform(ui, (jbyte*)buf1, len1, buf2, 1024);
				473
				474	UTF_ASSERT(len2==len1);
				475	UTF_ASSERT(strcmp(str, buf2) == 0);
				476
				477	i++;
				478	}
				479
				480	utfTerminate(ui, NULL);
				481
				482	}
				483
				/*
				 * Entry point of the test program: runs the self test once and exits
				 * with status 0.
				 */
				int
				main(int argc, char **argv)
				{
				    /* The command line is not consulted */
				    (void)argc;
				    (void)argv;

				    test();

				    return 0;
				}
				490
				491	#endif