Blame - jdk/src/share/classes/sun/net/idn/Punycode.java - platform/libcore

blob: b4696d23f3cec5edc801a87b50094133567f46d0 [file] [log] [blame]

J. Duke	319a3b9	2007-12-01 00:00:00 +0000	[diff] [blame^]	1	/*
				2	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				3	*
				4	* This code is free software; you can redistribute it and/or modify it
				5	* under the terms of the GNU General Public License version 2 only, as
				6	* published by the Free Software Foundation. Sun designates this
				7	* particular file as subject to the "Classpath" exception as provided
				8	* by Sun in the LICENSE file that accompanied this code.
				9	*
				10	* This code is distributed in the hope that it will be useful, but WITHOUT
				11	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				12	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				13	* version 2 for more details (a copy is included in the LICENSE file that
				14	* accompanied this code).
				15	*
				16	* You should have received a copy of the GNU General Public License version
				17	* 2 along with this work; if not, write to the Free Software Foundation,
				18	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
				19	*
				20	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
				21	* CA 95054 USA or visit www.sun.com if you need additional information or
				22	* have any questions.
				23	*/
				24	/*
				25	*******************************************************************************
				26	* Copyright (C) 2003-2004, International Business Machines Corporation and *
				27	* others. All Rights Reserved. *
				28	*******************************************************************************
				29	*/
				30	//
				31	// CHANGELOG
				32	// 2005-05-19 Edward Wang
				33	// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
				34	// - move from package com.ibm.icu.text to package sun.net.idn
				35	// - use ParseException instead of StringPrepParseException
				36	// 2007-08-14 Martin Buchholz
				37	// - remove redundant casts
				38	//
				39	package sun.net.idn;
				40
				41	import java.text.ParseException;
				42	import sun.text.normalizer.UCharacter;
				43	import sun.text.normalizer.UTF16;
				44
				45	/**
				46	* Ported code from ICU punycode.c
				47	* @author ram
				48	*/
				49
				50	/* Package Private class */
				51	public final class Punycode {
				52
				53	/* Punycode parameters for Bootstring */
				54	private static final int BASE = 36;
				55	private static final int TMIN = 1;
				56	private static final int TMAX = 26;
				57	private static final int SKEW = 38;
				58	private static final int DAMP = 700;
				59	private static final int INITIAL_BIAS = 72;
				60	private static final int INITIAL_N = 0x80;
				61
				62	/* "Basic" Unicode/ASCII code points */
				63	private static final int HYPHEN = 0x2d;
				64	private static final int DELIMITER = HYPHEN;
				65
				66	private static final int ZERO = 0x30;
				67	private static final int NINE = 0x39;
				68
				69	private static final int SMALL_A = 0x61;
				70	private static final int SMALL_Z = 0x7a;
				71
				72	private static final int CAPITAL_A = 0x41;
				73	private static final int CAPITAL_Z = 0x5a;
				74
				75	// TODO: eliminate the 256 limitation
				76	private static final int MAX_CP_COUNT = 256;
				77
				78	private static final int UINT_MAGIC = 0x80000000;
				79	private static final long ULONG_MAGIC = 0x8000000000000000L;
				80
				81	private static int adaptBias(int delta, int length, boolean firstTime){
				82	if(firstTime){
				83	delta /=DAMP;
				84	}else{
				85	delta /= 2;
				86	}
				87	delta += delta/length;
				88
				89	int count=0;
				90	for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
				91	delta/=(BASE-TMIN);
				92	}
				93
				94	return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
				95	}
				96
				97	/**
				98	* basicToDigit[] contains the numeric value of a basic code
				99	* point (for use in representing integers) in the range 0 to
				100	* BASE-1, or -1 if b is does not represent a value.
				101	*/
				102	static final int[] basicToDigit= new int[]{
				103	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				104	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				105
				106	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				107	26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
				108
				109	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
				110	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
				111
				112	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
				113	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
				114
				115	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				116	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				117
				118	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				119	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				120
				121	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				122	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				123
				124	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
				125	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
				126	};
				127
				128	private static char asciiCaseMap(char b, boolean uppercase) {
				129	if(uppercase) {
				130	if(SMALL_A<=b && b<=SMALL_Z) {
				131	b-=(SMALL_A-CAPITAL_A);
				132	}
				133	} else {
				134	if(CAPITAL_A<=b && b<=CAPITAL_Z) {
				135	b+=(SMALL_A-CAPITAL_A);
				136	}
				137	}
				138	return b;
				139	}
				140
				141	/**
				142	* digitToBasic() returns the basic code point whose value
				143	* (when used for representing integers) is d, which must be in the
				144	* range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
				145	* nonzero, in which case the uppercase form is used.
				146	*/
				147	private static char digitToBasic(int digit, boolean uppercase) {
				148	/* 0..25 map to ASCII a..z or A..Z */
				149	/* 26..35 map to ASCII 0..9 */
				150	if(digit<26) {
				151	if(uppercase) {
				152	return (char)(CAPITAL_A+digit);
				153	} else {
				154	return (char)(SMALL_A+digit);
				155	}
				156	} else {
				157	return (char)((ZERO-26)+digit);
				158	}
				159	}
				160	/**
				161	* Converts Unicode to Punycode.
				162	* The input string must not contain single, unpaired surrogates.
				163	* The output will be represented as an array of ASCII code points.
				164	*
				165	* @param src
				166	* @param caseFlags
				167	* @return
				168	* @throws ParseException
				169	*/
				170	public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{
				171
				172	int[] cpBuffer = new int[MAX_CP_COUNT];
				173	int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
				174	char c, c2;
				175	int srcLength = src.length();
				176	int destCapacity = MAX_CP_COUNT;
				177	char[] dest = new char[destCapacity];
				178	StringBuffer result = new StringBuffer();
				179	/*
				180	* Handle the basic code points and
				181	* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
				182	*/
				183	srcCPCount=destLength=0;
				184
				185	for(j=0; j<srcLength; ++j) {
				186	if(srcCPCount==MAX_CP_COUNT) {
				187	/* too many input code points */
				188	throw new IndexOutOfBoundsException();
				189	}
				190	c=src.charAt(j);
				191	if(isBasic(c)) {
				192	if(destLength<destCapacity) {
				193	cpBuffer[srcCPCount++]=0;
				194	dest[destLength]=
				195	caseFlags!=null ?
				196	asciiCaseMap(c, caseFlags[j]) :
				197	c;
				198	}
				199	++destLength;
				200	} else {
				201	n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
				202	if(!UTF16.isSurrogate(c)) {
				203	n\|=c;
				204	} else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
				205	++j;
				206
				207	n\|=UCharacter.getCodePoint(c, c2);
				208	} else {
				209	/* error: unmatched surrogate */
				210	throw new ParseException("Illegal char found", -1);
				211	}
				212	cpBuffer[srcCPCount++]=n;
				213	}
				214	}
				215
				216	/* Finish the basic string - if it is not empty - with a delimiter. */
				217	basicLength=destLength;
				218	if(basicLength>0) {
				219	if(destLength<destCapacity) {
				220	dest[destLength]=DELIMITER;
				221	}
				222	++destLength;
				223	}
				224
				225	/*
				226	* handledCPCount is the number of code points that have been handled
				227	* basicLength is the number of basic code points
				228	* destLength is the number of chars that have been output
				229	*/
				230
				231	/* Initialize the state: */
				232	n=INITIAL_N;
				233	delta=0;
				234	bias=INITIAL_BIAS;
				235
				236	/* Main encoding loop: */
				237	for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
				238	/*
				239	* All non-basic code points < n have been handled already.
				240	* Find the next larger one:
				241	*/
				242	for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
				243	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
				244	if(n<=q && q<m) {
				245	m=q;
				246	}
				247	}
				248
				249	/*
				250	* Increase delta enough to advance the decoder's
				251	* <n,i> state to <m,0>, but guard against overflow:
				252	*/
				253	if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
				254	throw new RuntimeException("Internal program error");
				255	}
				256	delta+=(m-n)*(handledCPCount+1);
				257	n=m;
				258
				259	/* Encode a sequence of same code points n */
				260	for(j=0; j<srcCPCount; ++j) {
				261	q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
				262	if(q<n) {
				263	++delta;
				264	} else if(q==n) {
				265	/* Represent delta as a generalized variable-length integer: */
				266	for(q=delta, k=BASE; /* no condition */; k+=BASE) {
				267
				268	/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
				269
				270	t=k-bias;
				271	if(t<TMIN) {
				272	t=TMIN;
				273	} else if(t>TMAX) {
				274	t=TMAX;
				275	}
				276	*/
				277
				278	t=k-bias;
				279	if(t<TMIN) {
				280	t=TMIN;
				281	} else if(k>=(bias+TMAX)) {
				282	t=TMAX;
				283	}
				284
				285	if(q<t) {
				286	break;
				287	}
				288
				289	if(destLength<destCapacity) {
				290	dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false);
				291	}
				292	q=(q-t)/(BASE-t);
				293	}
				294
				295	if(destLength<destCapacity) {
				296	dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0));
				297	}
				298	bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
				299	delta=0;
				300	++handledCPCount;
				301	}
				302	}
				303
				304	++delta;
				305	++n;
				306	}
				307
				308	return result.append(dest, 0, destLength);
				309	}
				310
				311	private static boolean isBasic(int ch){
				312	return (ch < INITIAL_N);
				313	}
				314
				315	private static boolean isBasicUpperCase(int ch){
				316	return( CAPITAL_A <= ch && ch <= CAPITAL_Z);
				317	}
				318
				319	private static boolean isSurrogate(int ch){
				320	return (((ch)&0xfffff800)==0xd800);
				321	}
				322	/**
				323	* Converts Punycode to Unicode.
				324	* The Unicode string will be at most as long as the Punycode string.
				325	*
				326	* @param src
				327	* @param caseFlags
				328	* @return
				329	* @throws ParseException
				330	*/
				331	public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
				332	throws ParseException{
				333	int srcLength = src.length();
				334	StringBuffer result = new StringBuffer();
				335	int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
				336	destCPCount, firstSupplementaryIndex, cpLength;
				337	char b;
				338	int destCapacity = MAX_CP_COUNT;
				339	char[] dest = new char[destCapacity];
				340
				341	/*
				342	* Handle the basic code points:
				343	* Let basicLength be the number of input code points
				344	* before the last delimiter, or 0 if there is none,
				345	* then copy the first basicLength code points to the output.
				346	*
				347	* The two following loops iterate backward.
				348	*/
				349	for(j=srcLength; j>0;) {
				350	if(src.charAt(--j)==DELIMITER) {
				351	break;
				352	}
				353	}
				354	destLength=basicLength=destCPCount=j;
				355
				356	while(j>0) {
				357	b=src.charAt(--j);
				358	if(!isBasic(b)) {
				359	throw new ParseException("Illegal char found", -1);
				360	}
				361
				362	if(j<destCapacity) {
				363	dest[j]= b;
				364
				365	if(caseFlags!=null) {
				366	caseFlags[j]=isBasicUpperCase(b);
				367	}
				368	}
				369	}
				370
				371	/* Initialize the state: */
				372	n=INITIAL_N;
				373	i=0;
				374	bias=INITIAL_BIAS;
				375	firstSupplementaryIndex=1000000000;
				376
				377	/*
				378	* Main decoding loop:
				379	* Start just after the last delimiter if any
				380	* basic code points were copied; start at the beginning otherwise.
				381	*/
				382	for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
				383	/*
				384	* in is the index of the next character to be consumed, and
				385	* destCPCount is the number of code points in the output array.
				386	*
				387	* Decode a generalized variable-length integer into delta,
				388	* which gets added to i. The overflow checking is easier
				389	* if we increase i as we go, then subtract off its starting
				390	* value at the end to obtain delta.
				391	*/
				392	for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
				393	if(in>=srcLength) {
				394	throw new ParseException("Illegal char found", -1);
				395	}
				396
				397	digit=basicToDigit[(byte)src.charAt(in++)];
				398	if(digit<0) {
				399	throw new ParseException("Invalid char found", -1);
				400	}
				401	if(digit>(0x7fffffff-i)/w) {
				402	/* integer overflow */
				403	throw new ParseException("Illegal char found", -1);
				404	}
				405
				406	i+=digit*w;
				407	t=k-bias;
				408	if(t<TMIN) {
				409	t=TMIN;
				410	} else if(k>=(bias+TMAX)) {
				411	t=TMAX;
				412	}
				413	if(digit<t) {
				414	break;
				415	}
				416
				417	if(w>0x7fffffff/(BASE-t)) {
				418	/* integer overflow */
				419	throw new ParseException("Illegal char found", -1);
				420	}
				421	w*=BASE-t;
				422	}
				423
				424	/*
				425	* Modification from sample code:
				426	* Increments destCPCount here,
				427	* where needed instead of in for() loop tail.
				428	*/
				429	++destCPCount;
				430	bias=adaptBias(i-oldi, destCPCount, (oldi==0));
				431
				432	/*
				433	* i was supposed to wrap around from (incremented) destCPCount to 0,
				434	* incrementing n each time, so we'll fix that now:
				435	*/
				436	if(i/destCPCount>(0x7fffffff-n)) {
				437	/* integer overflow */
				438	throw new ParseException("Illegal char found", -1);
				439	}
				440
				441	n+=i/destCPCount;
				442	i%=destCPCount;
				443	/* not needed for Punycode: */
				444	/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
				445
				446	if(n>0x10ffff \|\| isSurrogate(n)) {
				447	/* Unicode code point overflow */
				448	throw new ParseException("Illegal char found", -1);
				449	}
				450
				451	/* Insert n at position i of the output: */
				452	cpLength=UTF16.getCharCount(n);
				453	if((destLength+cpLength)<destCapacity) {
				454	int codeUnitIndex;
				455
				456	/*
				457	* Handle indexes when supplementary code points are present.
				458	*
				459	* In almost all cases, there will be only BMP code points before i
				460	* and even in the entire string.
				461	* This is handled with the same efficiency as with UTF-32.
				462	*
				463	* Only the rare cases with supplementary code points are handled
				464	* more slowly - but not too bad since this is an insertion anyway.
				465	*/
				466	if(i<=firstSupplementaryIndex) {
				467	codeUnitIndex=i;
				468	if(cpLength>1) {
				469	firstSupplementaryIndex=codeUnitIndex;
				470	} else {
				471	++firstSupplementaryIndex;
				472	}
				473	} else {
				474	codeUnitIndex=firstSupplementaryIndex;
				475	codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
				476	}
				477
				478	/* use the UChar index codeUnitIndex instead of the code point index i */
				479	if(codeUnitIndex<destLength) {
				480	System.arraycopy(dest, codeUnitIndex,
				481	dest, codeUnitIndex+cpLength,
				482	(destLength-codeUnitIndex));
				483	if(caseFlags!=null) {
				484	System.arraycopy(caseFlags, codeUnitIndex,
				485	caseFlags, codeUnitIndex+cpLength,
				486	destLength-codeUnitIndex);
				487	}
				488	}
				489	if(cpLength==1) {
				490	/* BMP, insert one code unit */
				491	dest[codeUnitIndex]=(char)n;
				492	} else {
				493	/* supplementary character, insert two code units */
				494	dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
				495	dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
				496	}
				497	if(caseFlags!=null) {
				498	/* Case of last character determines uppercase flag: */
				499	caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
				500	if(cpLength==2) {
				501	caseFlags[codeUnitIndex+1]=false;
				502	}
				503	}
				504	}
				505	destLength+=cpLength;
				506	++i;
				507	}
				508	result.append(dest, 0, destLength);
				509	return result;
				510	}
				511	}