Blame - jdk/src/share/classes/sun/text/normalizer/Utility.java - platform/libcore

blob: 895097f13d25566434db7a558b12499b16f61fbb [file] [log] [blame]

J. Duke	319a3b9	2007-12-01 00:00:00 +0000	[diff] [blame^]	1	/*
				2	* Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
				3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				4	*
				5	* This code is free software; you can redistribute it and/or modify it
				6	* under the terms of the GNU General Public License version 2 only, as
				7	* published by the Free Software Foundation. Sun designates this
				8	* particular file as subject to the "Classpath" exception as provided
				9	* by Sun in the LICENSE file that accompanied this code.
				10	*
				11	* This code is distributed in the hope that it will be useful, but WITHOUT
				12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				14	* version 2 for more details (a copy is included in the LICENSE file that
				15	* accompanied this code).
				16	*
				17	* You should have received a copy of the GNU General Public License version
				18	* 2 along with this work; if not, write to the Free Software Foundation,
				19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
				20	*
				21	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
				22	* CA 95054 USA or visit www.sun.com if you need additional information or
				23	* have any questions.
				24	*/
				25	/*
				26	*******************************************************************************
				27	* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
				28	* *
				29	* The original version of this source code and documentation is copyrighted *
				30	* and owned by IBM, These materials are provided under terms of a License *
				31	* Agreement between IBM and Sun. This technology is protected by multiple *
				32	* US and International patents. This notice and attribution to IBM may not *
				33	* to removed. *
				34	*******************************************************************************
				35	*/
				36
				37	package sun.text.normalizer;
				38
				39	// This class contains utility functions so testing not needed
				40	///CLOVER:OFF
				41	public final class Utility {
				42
				43	/**
				44	* Convert characters outside the range U+0020 to U+007F to
				45	* Unicode escapes, and convert backslash to a double backslash.
				46	*/
				47	public static final String escape(String s) {
				48	StringBuffer buf = new StringBuffer();
				49	for (int i=0; i<s.length(); ) {
				50	int c = UTF16.charAt(s, i);
				51	i += UTF16.getCharCount(c);
				52	if (c >= ' ' && c <= 0x007F) {
				53	if (c == '\\') {
				54	buf.append("\\\\"); // That is, "\\"
				55	} else {
				56	buf.append((char)c);
				57	}
				58	} else {
				59	boolean four = c <= 0xFFFF;
				60	buf.append(four ? "\\u" : "\\U");
				61	hex(c, four ? 4 : 8, buf);
				62	}
				63	}
				64	return buf.toString();
				65	}
				66
				67	/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
				68	static private final char[] UNESCAPE_MAP = {
				69	/" 0x22, 0x22 /
				70	/' 0x27, 0x27 /
				71	/? 0x3F, 0x3F /
				72	/\ 0x5C, 0x5C /
				73	/a/ 0x61, 0x07,
				74	/b/ 0x62, 0x08,
				75	/e/ 0x65, 0x1b,
				76	/f/ 0x66, 0x0c,
				77	/n/ 0x6E, 0x0a,
				78	/r/ 0x72, 0x0d,
				79	/t/ 0x74, 0x09,
				80	/v/ 0x76, 0x0b
				81	};
				82
				83	/**
				84	* Convert an escape to a 32-bit code point value. We attempt
				85	* to parallel the icu4c unescapeAt() function.
				86	* @param offset16 an array containing offset to the character
				87	* <em>after</em> the backslash. Upon return offset16[0] will
				88	* be updated to point after the escape sequence.
				89	* @return character value from 0 to 10FFFF, or -1 on error.
				90	*/
				91	public static int unescapeAt(String s, int[] offset16) {
				92	int c;
				93	int result = 0;
				94	int n = 0;
				95	int minDig = 0;
				96	int maxDig = 0;
				97	int bitsPerDigit = 4;
				98	int dig;
				99	int i;
				100	boolean braces = false;
				101
				102	/* Check that offset is in range */
				103	int offset = offset16[0];
				104	int length = s.length();
				105	if (offset < 0 \|\| offset >= length) {
				106	return -1;
				107	}
				108
				109	/* Fetch first UChar after '\\' */
				110	c = UTF16.charAt(s, offset);
				111	offset += UTF16.getCharCount(c);
				112
				113	/* Convert hexadecimal and octal escapes */
				114	switch (c) {
				115	case 'u':
				116	minDig = maxDig = 4;
				117	break;
				118	case 'U':
				119	minDig = maxDig = 8;
				120	break;
				121	case 'x':
				122	minDig = 1;
				123	if (offset < length && UTF16.charAt(s, offset) == 0x7B /{/) {
				124	++offset;
				125	braces = true;
				126	maxDig = 8;
				127	} else {
				128	maxDig = 2;
				129	}
				130	break;
				131	default:
				132	dig = UCharacter.digit(c, 8);
				133	if (dig >= 0) {
				134	minDig = 1;
				135	maxDig = 3;
				136	n = 1; /* Already have first octal digit */
				137	bitsPerDigit = 3;
				138	result = dig;
				139	}
				140	break;
				141	}
				142	if (minDig != 0) {
				143	while (offset < length && n < maxDig) {
				144	c = UTF16.charAt(s, offset);
				145	dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
				146	if (dig < 0) {
				147	break;
				148	}
				149	result = (result << bitsPerDigit) \| dig;
				150	offset += UTF16.getCharCount(c);
				151	++n;
				152	}
				153	if (n < minDig) {
				154	return -1;
				155	}
				156	if (braces) {
				157	if (c != 0x7D /}/) {
				158	return -1;
				159	}
				160	++offset;
				161	}
				162	if (result < 0 \|\| result >= 0x110000) {
				163	return -1;
				164	}
				165	// If an escape sequence specifies a lead surrogate, see
				166	// if there is a trail surrogate after it, either as an
				167	// escape or as a literal. If so, join them up into a
				168	// supplementary.
				169	if (offset < length &&
				170	UTF16.isLeadSurrogate((char) result)) {
				171	int ahead = offset+1;
				172	c = s.charAt(offset); // [sic] get 16-bit code unit
				173	if (c == '\\' && ahead < length) {
				174	int o[] = new int[] { ahead };
				175	c = unescapeAt(s, o);
				176	ahead = o[0];
				177	}
				178	if (UTF16.isTrailSurrogate((char) c)) {
				179	offset = ahead;
				180	result = UCharacterProperty.getRawSupplementary(
				181	(char) result, (char) c);
				182	}
				183	}
				184	offset16[0] = offset;
				185	return result;
				186	}
				187
				188	/* Convert C-style escapes in table */
				189	for (i=0; i<UNESCAPE_MAP.length; i+=2) {
				190	if (c == UNESCAPE_MAP[i]) {
				191	offset16[0] = offset;
				192	return UNESCAPE_MAP[i+1];
				193	} else if (c < UNESCAPE_MAP[i]) {
				194	break;
				195	}
				196	}
				197
				198	/* Map \cX to control-X: X & 0x1F */
				199	if (c == 'c' && offset < length) {
				200	c = UTF16.charAt(s, offset);
				201	offset16[0] = offset + UTF16.getCharCount(c);
				202	return 0x1F & c;
				203	}
				204
				205	/* If no special forms are recognized, then consider
				206	* the backslash to generically escape the next character. */
				207	offset16[0] = offset;
				208	return c;
				209	}
				210
				211	/**
				212	* Convert a integer to size width hex uppercase digits.
				213	* E.g., hex('a', 4, str) => "0041".
				214	* Append the output to the given StringBuffer.
				215	* If width is too small to fit, nothing will be appended to output.
				216	*/
				217	public static StringBuffer hex(int ch, int width, StringBuffer output) {
				218	return appendNumber(output, ch, 16, width);
				219	}
				220
				221	/**
				222	* Convert a integer to size width (minimum) hex uppercase digits.
				223	* E.g., hex('a', 4, str) => "0041". If the integer requires more
				224	* than width digits, more will be used.
				225	*/
				226	public static String hex(int ch, int width) {
				227	StringBuffer buf = new StringBuffer();
				228	return appendNumber(buf, ch, 16, width).toString();
				229	}
				230
				231	/**
				232	* Skip over a sequence of zero or more white space characters
				233	* at pos. Return the index of the first non-white-space character
				234	* at or after pos, or str.length(), if there is none.
				235	*/
				236	public static int skipWhitespace(String str, int pos) {
				237	while (pos < str.length()) {
				238	int c = UTF16.charAt(str, pos);
				239	if (!UCharacterProperty.isRuleWhiteSpace(c)) {
				240	break;
				241	}
				242	pos += UTF16.getCharCount(c);
				243	}
				244	return pos;
				245	}
				246
				247	static final char DIGITS[] = {
				248	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
				249	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
				250	'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
				251	'U', 'V', 'W', 'X', 'Y', 'Z'
				252	};
				253
				254	/**
				255	* Append the digits of a positive integer to the given
				256	* <code>StringBuffer</code> in the given radix. This is
				257	* done recursively since it is easiest to generate the low-
				258	* order digit first, but it must be appended last.
				259	*
				260	* @param result is the <code>StringBuffer</code> to append to
				261	* @param n is the positive integer
				262	* @param radix is the radix, from 2 to 36 inclusive
				263	* @param minDigits is the minimum number of digits to append.
				264	*/
				265	private static void recursiveAppendNumber(StringBuffer result, int n,
				266	int radix, int minDigits)
				267	{
				268	int digit = n % radix;
				269
				270	if (n >= radix \|\| minDigits > 1) {
				271	recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
				272	}
				273
				274	result.append(DIGITS[digit]);
				275	}
				276
				277	/**
				278	* Append a number to the given StringBuffer in the given radix.
				279	* Standard digits '0'-'9' are used and letters 'A'-'Z' for
				280	* radices 11 through 36.
				281	* @param result the digits of the number are appended here
				282	* @param n the number to be converted to digits; may be negative.
				283	* If negative, a '-' is prepended to the digits.
				284	* @param radix a radix from 2 to 36 inclusive.
				285	* @param minDigits the minimum number of digits, not including
				286	* any '-', to produce. Values less than 2 have no effect. One
				287	* digit is always emitted regardless of this parameter.
				288	* @return a reference to result
				289	*/
				290	public static StringBuffer appendNumber(StringBuffer result, int n,
				291	int radix, int minDigits)
				292	throws IllegalArgumentException
				293	{
				294	if (radix < 2 \|\| radix > 36) {
				295	throw new IllegalArgumentException("Illegal radix " + radix);
				296	}
				297
				298
				299	int abs = n;
				300
				301	if (n < 0) {
				302	abs = -n;
				303	result.append("-");
				304	}
				305
				306	recursiveAppendNumber(result, abs, radix, minDigits);
				307
				308	return result;
				309	}
				310
				311	/**
				312	* Return true if the character is NOT printable ASCII. The tab,
				313	* newline and linefeed characters are considered unprintable.
				314	*/
				315	public static boolean isUnprintable(int c) {
				316	return !(c >= 0x20 && c <= 0x7E);
				317	}
				318
				319	/**
				320	* Escape unprintable characters using <backslash>uxxxx notation
				321	* for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
				322	* above. If the character is printable ASCII, then do nothing
				323	* and return FALSE. Otherwise, append the escaped notation and
				324	* return TRUE.
				325	*/
				326	public static boolean escapeUnprintable(StringBuffer result, int c) {
				327	if (isUnprintable(c)) {
				328	result.append('\\');
				329	if ((c & ~0xFFFF) != 0) {
				330	result.append('U');
				331	result.append(DIGITS[0xF&(c>>28)]);
				332	result.append(DIGITS[0xF&(c>>24)]);
				333	result.append(DIGITS[0xF&(c>>20)]);
				334	result.append(DIGITS[0xF&(c>>16)]);
				335	} else {
				336	result.append('u');
				337	}
				338	result.append(DIGITS[0xF&(c>>12)]);
				339	result.append(DIGITS[0xF&(c>>8)]);
				340	result.append(DIGITS[0xF&(c>>4)]);
				341	result.append(DIGITS[0xF&c]);
				342	return true;
				343	}
				344	return false;
				345	}
				346
				347	//// for StringPrep
				348	/**
				349	* Similar to StringBuffer.getChars, version 1.3.
				350	* Since JDK 1.2 implements StringBuffer.getChars differently, this method
				351	* is here to provide consistent results.
				352	* To be removed after JDK 1.2 ceased to be the reference platform.
				353	* @param src source string buffer
				354	* @param srcBegin offset to the start of the src to retrieve from
				355	* @param srcEnd offset to the end of the src to retrieve from
				356	* @param dst char array to store the retrieved chars
				357	* @param dstBegin offset to the start of the destination char array to
				358	* store the retrieved chars
				359	* @draft since ICU4J 2.0
				360	*/
				361	public static void getChars(StringBuffer src, int srcBegin, int srcEnd,
				362	char dst[], int dstBegin)
				363	{
				364	if (srcBegin == srcEnd) {
				365	return;
				366	}
				367	src.getChars(srcBegin, srcEnd, dst, dstBegin);
				368	}
				369
				370	/**
				371	* Convenience utility to compare two char[]s.
				372	* @param len the length to compare.
				373	* The start indices and start+len must be valid.
				374	*/
				375	public final static boolean arrayRegionMatches(char[] source, int sourceStart,
				376	char[] target, int targetStart,
				377	int len)
				378	{
				379	int sourceEnd = sourceStart + len;
				380	int delta = targetStart - sourceStart;
				381	for (int i = sourceStart; i < sourceEnd; i++) {
				382	if (source[i] != target[i + delta])
				383	return false;
				384	}
				385	return true;
				386	}
				387
				388	}
				389	///CLOVER:ON