blob: b4696d23f3cec5edc801a87b50094133567f46d0 [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3 *
4 * This code is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License version 2 only, as
6 * published by the Free Software Foundation. Sun designates this
7 * particular file as subject to the "Classpath" exception as provided
8 * by Sun in the LICENSE file that accompanied this code.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 */
24/*
25 *******************************************************************************
26 * Copyright (C) 2003-2004, International Business Machines Corporation and *
27 * others. All Rights Reserved. *
28 *******************************************************************************
29 */
30//
31// CHANGELOG
32// 2005-05-19 Edward Wang
33// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
34// - move from package com.ibm.icu.text to package sun.net.idn
35// - use ParseException instead of StringPrepParseException
36// 2007-08-14 Martin Buchholz
37// - remove redundant casts
38//
39package sun.net.idn;
40
41import java.text.ParseException;
42import sun.text.normalizer.UCharacter;
43import sun.text.normalizer.UTF16;
44
45/**
46 * Ported code from ICU punycode.c
47 * @author ram
48 */
49
50/* Package Private class */
51public final class Punycode {
52
53 /* Punycode parameters for Bootstring */
54 private static final int BASE = 36;
55 private static final int TMIN = 1;
56 private static final int TMAX = 26;
57 private static final int SKEW = 38;
58 private static final int DAMP = 700;
59 private static final int INITIAL_BIAS = 72;
60 private static final int INITIAL_N = 0x80;
61
62 /* "Basic" Unicode/ASCII code points */
63 private static final int HYPHEN = 0x2d;
64 private static final int DELIMITER = HYPHEN;
65
66 private static final int ZERO = 0x30;
67 private static final int NINE = 0x39;
68
69 private static final int SMALL_A = 0x61;
70 private static final int SMALL_Z = 0x7a;
71
72 private static final int CAPITAL_A = 0x41;
73 private static final int CAPITAL_Z = 0x5a;
74
75 // TODO: eliminate the 256 limitation
76 private static final int MAX_CP_COUNT = 256;
77
78 private static final int UINT_MAGIC = 0x80000000;
79 private static final long ULONG_MAGIC = 0x8000000000000000L;
80
81 private static int adaptBias(int delta, int length, boolean firstTime){
82 if(firstTime){
83 delta /=DAMP;
84 }else{
85 delta /= 2;
86 }
87 delta += delta/length;
88
89 int count=0;
90 for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
91 delta/=(BASE-TMIN);
92 }
93
94 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
95 }
96
97 /**
98 * basicToDigit[] contains the numeric value of a basic code
99 * point (for use in representing integers) in the range 0 to
100 * BASE-1, or -1 if b is does not represent a value.
101 */
102 static final int[] basicToDigit= new int[]{
103 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
104 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
105
106 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
107 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
108
109 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
110 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
111
112 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
113 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
114
115 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
116 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
117
118 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
119 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
120
121 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
122 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
123
124 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
125 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
126 };
127
128 private static char asciiCaseMap(char b, boolean uppercase) {
129 if(uppercase) {
130 if(SMALL_A<=b && b<=SMALL_Z) {
131 b-=(SMALL_A-CAPITAL_A);
132 }
133 } else {
134 if(CAPITAL_A<=b && b<=CAPITAL_Z) {
135 b+=(SMALL_A-CAPITAL_A);
136 }
137 }
138 return b;
139 }
140
141 /**
142 * digitToBasic() returns the basic code point whose value
143 * (when used for representing integers) is d, which must be in the
144 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
145 * nonzero, in which case the uppercase form is used.
146 */
147 private static char digitToBasic(int digit, boolean uppercase) {
148 /* 0..25 map to ASCII a..z or A..Z */
149 /* 26..35 map to ASCII 0..9 */
150 if(digit<26) {
151 if(uppercase) {
152 return (char)(CAPITAL_A+digit);
153 } else {
154 return (char)(SMALL_A+digit);
155 }
156 } else {
157 return (char)((ZERO-26)+digit);
158 }
159 }
160 /**
161 * Converts Unicode to Punycode.
162 * The input string must not contain single, unpaired surrogates.
163 * The output will be represented as an array of ASCII code points.
164 *
165 * @param src
166 * @param caseFlags
167 * @return
168 * @throws ParseException
169 */
170 public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{
171
172 int[] cpBuffer = new int[MAX_CP_COUNT];
173 int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
174 char c, c2;
175 int srcLength = src.length();
176 int destCapacity = MAX_CP_COUNT;
177 char[] dest = new char[destCapacity];
178 StringBuffer result = new StringBuffer();
179 /*
180 * Handle the basic code points and
181 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
182 */
183 srcCPCount=destLength=0;
184
185 for(j=0; j<srcLength; ++j) {
186 if(srcCPCount==MAX_CP_COUNT) {
187 /* too many input code points */
188 throw new IndexOutOfBoundsException();
189 }
190 c=src.charAt(j);
191 if(isBasic(c)) {
192 if(destLength<destCapacity) {
193 cpBuffer[srcCPCount++]=0;
194 dest[destLength]=
195 caseFlags!=null ?
196 asciiCaseMap(c, caseFlags[j]) :
197 c;
198 }
199 ++destLength;
200 } else {
201 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
202 if(!UTF16.isSurrogate(c)) {
203 n|=c;
204 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
205 ++j;
206
207 n|=UCharacter.getCodePoint(c, c2);
208 } else {
209 /* error: unmatched surrogate */
210 throw new ParseException("Illegal char found", -1);
211 }
212 cpBuffer[srcCPCount++]=n;
213 }
214 }
215
216 /* Finish the basic string - if it is not empty - with a delimiter. */
217 basicLength=destLength;
218 if(basicLength>0) {
219 if(destLength<destCapacity) {
220 dest[destLength]=DELIMITER;
221 }
222 ++destLength;
223 }
224
225 /*
226 * handledCPCount is the number of code points that have been handled
227 * basicLength is the number of basic code points
228 * destLength is the number of chars that have been output
229 */
230
231 /* Initialize the state: */
232 n=INITIAL_N;
233 delta=0;
234 bias=INITIAL_BIAS;
235
236 /* Main encoding loop: */
237 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
238 /*
239 * All non-basic code points < n have been handled already.
240 * Find the next larger one:
241 */
242 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
243 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
244 if(n<=q && q<m) {
245 m=q;
246 }
247 }
248
249 /*
250 * Increase delta enough to advance the decoder's
251 * <n,i> state to <m,0>, but guard against overflow:
252 */
253 if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
254 throw new RuntimeException("Internal program error");
255 }
256 delta+=(m-n)*(handledCPCount+1);
257 n=m;
258
259 /* Encode a sequence of same code points n */
260 for(j=0; j<srcCPCount; ++j) {
261 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
262 if(q<n) {
263 ++delta;
264 } else if(q==n) {
265 /* Represent delta as a generalized variable-length integer: */
266 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
267
268 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
269
270 t=k-bias;
271 if(t<TMIN) {
272 t=TMIN;
273 } else if(t>TMAX) {
274 t=TMAX;
275 }
276 */
277
278 t=k-bias;
279 if(t<TMIN) {
280 t=TMIN;
281 } else if(k>=(bias+TMAX)) {
282 t=TMAX;
283 }
284
285 if(q<t) {
286 break;
287 }
288
289 if(destLength<destCapacity) {
290 dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false);
291 }
292 q=(q-t)/(BASE-t);
293 }
294
295 if(destLength<destCapacity) {
296 dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0));
297 }
298 bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
299 delta=0;
300 ++handledCPCount;
301 }
302 }
303
304 ++delta;
305 ++n;
306 }
307
308 return result.append(dest, 0, destLength);
309 }
310
311 private static boolean isBasic(int ch){
312 return (ch < INITIAL_N);
313 }
314
315 private static boolean isBasicUpperCase(int ch){
316 return( CAPITAL_A <= ch && ch <= CAPITAL_Z);
317 }
318
319 private static boolean isSurrogate(int ch){
320 return (((ch)&0xfffff800)==0xd800);
321 }
322 /**
323 * Converts Punycode to Unicode.
324 * The Unicode string will be at most as long as the Punycode string.
325 *
326 * @param src
327 * @param caseFlags
328 * @return
329 * @throws ParseException
330 */
331 public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
332 throws ParseException{
333 int srcLength = src.length();
334 StringBuffer result = new StringBuffer();
335 int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
336 destCPCount, firstSupplementaryIndex, cpLength;
337 char b;
338 int destCapacity = MAX_CP_COUNT;
339 char[] dest = new char[destCapacity];
340
341 /*
342 * Handle the basic code points:
343 * Let basicLength be the number of input code points
344 * before the last delimiter, or 0 if there is none,
345 * then copy the first basicLength code points to the output.
346 *
347 * The two following loops iterate backward.
348 */
349 for(j=srcLength; j>0;) {
350 if(src.charAt(--j)==DELIMITER) {
351 break;
352 }
353 }
354 destLength=basicLength=destCPCount=j;
355
356 while(j>0) {
357 b=src.charAt(--j);
358 if(!isBasic(b)) {
359 throw new ParseException("Illegal char found", -1);
360 }
361
362 if(j<destCapacity) {
363 dest[j]= b;
364
365 if(caseFlags!=null) {
366 caseFlags[j]=isBasicUpperCase(b);
367 }
368 }
369 }
370
371 /* Initialize the state: */
372 n=INITIAL_N;
373 i=0;
374 bias=INITIAL_BIAS;
375 firstSupplementaryIndex=1000000000;
376
377 /*
378 * Main decoding loop:
379 * Start just after the last delimiter if any
380 * basic code points were copied; start at the beginning otherwise.
381 */
382 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
383 /*
384 * in is the index of the next character to be consumed, and
385 * destCPCount is the number of code points in the output array.
386 *
387 * Decode a generalized variable-length integer into delta,
388 * which gets added to i. The overflow checking is easier
389 * if we increase i as we go, then subtract off its starting
390 * value at the end to obtain delta.
391 */
392 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
393 if(in>=srcLength) {
394 throw new ParseException("Illegal char found", -1);
395 }
396
397 digit=basicToDigit[(byte)src.charAt(in++)];
398 if(digit<0) {
399 throw new ParseException("Invalid char found", -1);
400 }
401 if(digit>(0x7fffffff-i)/w) {
402 /* integer overflow */
403 throw new ParseException("Illegal char found", -1);
404 }
405
406 i+=digit*w;
407 t=k-bias;
408 if(t<TMIN) {
409 t=TMIN;
410 } else if(k>=(bias+TMAX)) {
411 t=TMAX;
412 }
413 if(digit<t) {
414 break;
415 }
416
417 if(w>0x7fffffff/(BASE-t)) {
418 /* integer overflow */
419 throw new ParseException("Illegal char found", -1);
420 }
421 w*=BASE-t;
422 }
423
424 /*
425 * Modification from sample code:
426 * Increments destCPCount here,
427 * where needed instead of in for() loop tail.
428 */
429 ++destCPCount;
430 bias=adaptBias(i-oldi, destCPCount, (oldi==0));
431
432 /*
433 * i was supposed to wrap around from (incremented) destCPCount to 0,
434 * incrementing n each time, so we'll fix that now:
435 */
436 if(i/destCPCount>(0x7fffffff-n)) {
437 /* integer overflow */
438 throw new ParseException("Illegal char found", -1);
439 }
440
441 n+=i/destCPCount;
442 i%=destCPCount;
443 /* not needed for Punycode: */
444 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
445
446 if(n>0x10ffff || isSurrogate(n)) {
447 /* Unicode code point overflow */
448 throw new ParseException("Illegal char found", -1);
449 }
450
451 /* Insert n at position i of the output: */
452 cpLength=UTF16.getCharCount(n);
453 if((destLength+cpLength)<destCapacity) {
454 int codeUnitIndex;
455
456 /*
457 * Handle indexes when supplementary code points are present.
458 *
459 * In almost all cases, there will be only BMP code points before i
460 * and even in the entire string.
461 * This is handled with the same efficiency as with UTF-32.
462 *
463 * Only the rare cases with supplementary code points are handled
464 * more slowly - but not too bad since this is an insertion anyway.
465 */
466 if(i<=firstSupplementaryIndex) {
467 codeUnitIndex=i;
468 if(cpLength>1) {
469 firstSupplementaryIndex=codeUnitIndex;
470 } else {
471 ++firstSupplementaryIndex;
472 }
473 } else {
474 codeUnitIndex=firstSupplementaryIndex;
475 codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
476 }
477
478 /* use the UChar index codeUnitIndex instead of the code point index i */
479 if(codeUnitIndex<destLength) {
480 System.arraycopy(dest, codeUnitIndex,
481 dest, codeUnitIndex+cpLength,
482 (destLength-codeUnitIndex));
483 if(caseFlags!=null) {
484 System.arraycopy(caseFlags, codeUnitIndex,
485 caseFlags, codeUnitIndex+cpLength,
486 destLength-codeUnitIndex);
487 }
488 }
489 if(cpLength==1) {
490 /* BMP, insert one code unit */
491 dest[codeUnitIndex]=(char)n;
492 } else {
493 /* supplementary character, insert two code units */
494 dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
495 dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
496 }
497 if(caseFlags!=null) {
498 /* Case of last character determines uppercase flag: */
499 caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
500 if(cpLength==2) {
501 caseFlags[codeUnitIndex+1]=false;
502 }
503 }
504 }
505 destLength+=cpLength;
506 ++i;
507 }
508 result.append(dest, 0, destLength);
509 return result;
510 }
511}