Blame - jdk/src/share/classes/sun/text/normalizer/UTF16.java - platform/libcore

blob: 3b872a6063fe68bee273f8390fbb701e9706684a [file] [log] [blame]

J. Duke	319a3b9	2007-12-01 00:00:00 +0000	[diff] [blame^]	1	/*
				2	* Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
				3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				4	*
				5	* This code is free software; you can redistribute it and/or modify it
				6	* under the terms of the GNU General Public License version 2 only, as
				7	* published by the Free Software Foundation. Sun designates this
				8	* particular file as subject to the "Classpath" exception as provided
				9	* by Sun in the LICENSE file that accompanied this code.
				10	*
				11	* This code is distributed in the hope that it will be useful, but WITHOUT
				12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				14	* version 2 for more details (a copy is included in the LICENSE file that
				15	* accompanied this code).
				16	*
				17	* You should have received a copy of the GNU General Public License version
				18	* 2 along with this work; if not, write to the Free Software Foundation,
				19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
				20	*
				21	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
				22	* CA 95054 USA or visit www.sun.com if you need additional information or
				23	* have any questions.
				24	*/
				25
				26	/*
				27	*******************************************************************************
				28	* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
				29	* *
				30	* The original version of this source code and documentation is copyrighted *
				31	* and owned by IBM, These materials are provided under terms of a License *
				32	* Agreement between IBM and Sun. This technology is protected by multiple *
				33	* US and International patents. This notice and attribution to IBM may not *
				34	* to removed. *
				35	*******************************************************************************
				36	*/
				37
				38	package sun.text.normalizer;
				39
				40	/**
				41	* <p>Standalone utility class providing UTF16 character conversions and
				42	* indexing conversions.</p>
				43	* <p>Code that uses strings alone rarely needs modification.
				44	* By design, UTF-16 does not allow overlap, so searching for strings is a safe
				45	* operation. Similarly, concatenation is always safe. Substringing is safe if
				46	* the start and end are both on UTF-32 boundaries. In normal code, the values
				47	* for start and end are on those boundaries, since they arose from operations
				48	* like searching. If not, the nearest UTF-32 boundaries can be determined
				49	* using <code>bounds()</code>.</p>
				50	* <strong>Examples:</strong>
				51	* <p>The following examples illustrate use of some of these methods.
				52	* <pre>
				53	* // iteration forwards: Original
				54	* for (int i = 0; i < s.length(); ++i) {
				55	* char ch = s.charAt(i);
				56	* doSomethingWith(ch);
				57	* }
				58	*
				59	* // iteration forwards: Changes for UTF-32
				60	* int ch;
				61	* for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) {
				62	* ch = UTF16.charAt(s,i);
				63	* doSomethingWith(ch);
				64	* }
				65	*
				66	* // iteration backwards: Original
				67	* for (int i = s.length() -1; i >= 0; --i) {
				68	* char ch = s.charAt(i);
				69	* doSomethingWith(ch);
				70	* }
				71	*
				72	* // iteration backwards: Changes for UTF-32
				73	* int ch;
				74	* for (int i = s.length() -1; i >= 0; i-=UTF16.getCharCount(ch)) {
				75	* ch = UTF16.charAt(s,i);
				76	* doSomethingWith(ch);
				77	* }
				78	* </pre>
				79	* <strong>Notes:</strong>
				80	* <ul>
				81	* <li>
				82	* <strong>Naming:</strong> For clarity, High and Low surrogates are called
				83	* <code>Lead</code> and <code>Trail</code> in the API, which gives a better
				84	* sense of their ordering in a string. <code>offset16</code> and
				85	* <code>offset32</code> are used to distinguish offsets to UTF-16
				86	* boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
				87	* used to contain UTF-32 characters, as opposed to <code>char16</code>,
				88	* which is a UTF-16 code unit.
				89	* </li>
				90	* <li>
				91	* <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
				92	* UTF-32 offset to a UTF-16 offset and back. Because of the difference in
				93	* structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
				94	* back if and only if <code>bounds(string, offset16) != TRAIL</code>.
				95	* </li>
				96	* <li>
				97	* <strong>Exceptions:</strong> The error checking will throw an exception
				98	* if indices are out of bounds. Other than that, all methods will
				99	* behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
				100	* values are present. <code>UCharacter.isLegal()</code> can be used to check
				101	* for validity if desired.
				102	* </li>
				103	* <li>
				104	* <strong>Unmatched Surrogates:</strong> If the string contains unmatched
				105	* surrogates, then these are counted as one UTF-32 value. This matches
				106	* their iteration behavior, which is vital. It also matches common display
				107	* practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
				108	* </li>
				109	* <li>
				110	* <strong>Optimization:</strong> The method implementations may need
				111	* optimization if the compiler doesn't fold static final methods. Since
				112	* surrogate pairs will form an exceedingly small percentage of all the text
				113	* in the world, the singleton case should always be optimized for.
				114	* </li>
				115	* </ul>
				116	* @author Mark Davis, with help from Markus Scherer
				117	* @stable ICU 2.1
				118	*/
				119
				120	public final class UTF16
				121	{
				122	// public variables ---------------------------------------------------
				123
				124	/**
				125	* The lowest Unicode code point value.
				126	* @stable ICU 2.1
				127	*/
				128	public static final int CODEPOINT_MIN_VALUE = 0;
				129	/**
				130	* The highest Unicode code point value (scalar value) according to the
				131	* Unicode Standard.
				132	* @stable ICU 2.1
				133	*/
				134	public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
				135	/**
				136	* The minimum value for Supplementary code points
				137	* @stable ICU 2.1
				138	*/
				139	public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
				140	/**
				141	* Lead surrogate minimum value
				142	* @stable ICU 2.1
				143	*/
				144	public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
				145	/**
				146	* Trail surrogate minimum value
				147	* @stable ICU 2.1
				148	*/
				149	public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
				150	/**
				151	* Lead surrogate maximum value
				152	* @stable ICU 2.1
				153	*/
				154	public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
				155	/**
				156	* Trail surrogate maximum value
				157	* @stable ICU 2.1
				158	*/
				159	public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
				160	/**
				161	* Surrogate minimum value
				162	* @stable ICU 2.1
				163	*/
				164	public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
				165
				166	// public method ------------------------------------------------------
				167
				168	/**
				169	* Extract a single UTF-32 value from a string.
				170	* Used when iterating forwards or backwards (with
				171	* <code>UTF16.getCharCount()</code>, as well as random access. If a
				172	* validity check is required, use
				173	* <code><a href="../lang/UCharacter.html#isLegal(char)">
				174	* UCharacter.isLegal()</a></code> on the return value.
				175	* If the char retrieved is part of a surrogate pair, its supplementary
				176	* character will be returned. If a complete supplementary character is
				177	* not found the incomplete character will be returned
				178	* @param source array of UTF-16 chars
				179	* @param offset16 UTF-16 offset to the start of the character.
				180	* @return UTF-32 value for the UTF-32 value that contains the char at
				181	* offset16. The boundaries of that codepoint are the same as in
				182	* <code>bounds32()</code>.
				183	* @exception IndexOutOfBoundsException thrown if offset16 is out of
				184	* bounds.
				185	* @stable ICU 2.1
				186	*/
				187	public static int charAt(String source, int offset16)
				188	{
				189	if (offset16 < 0 \|\| offset16 >= source.length()) {
				190	throw new StringIndexOutOfBoundsException(offset16);
				191	}
				192
				193	char single = source.charAt(offset16);
				194	if (single < LEAD_SURROGATE_MIN_VALUE \|\|
				195	single > TRAIL_SURROGATE_MAX_VALUE) {
				196	return single;
				197	}
				198
				199	// Convert the UTF-16 surrogate pair if necessary.
				200	// For simplicity in usage, and because the frequency of pairs is
				201	// low, look both directions.
				202
				203	if (single <= LEAD_SURROGATE_MAX_VALUE) {
				204	++ offset16;
				205	if (source.length() != offset16) {
				206	char trail = source.charAt(offset16);
				207	if (trail >= TRAIL_SURROGATE_MIN_VALUE &&
				208	trail <= TRAIL_SURROGATE_MAX_VALUE) {
				209	return UCharacterProperty.getRawSupplementary(single,
				210	trail);
				211	}
				212	}
				213	}
				214	else
				215	{
				216	-- offset16;
				217	if (offset16 >= 0) {
				218	// single is a trail surrogate so
				219	char lead = source.charAt(offset16);
				220	if (lead >= LEAD_SURROGATE_MIN_VALUE &&
				221	lead <= LEAD_SURROGATE_MAX_VALUE) {
				222	return UCharacterProperty.getRawSupplementary(lead,
				223	single);
				224	}
				225	}
				226	}
				227	return single; // return unmatched surrogate
				228	}
				229
				230	/**
				231	* Extract a single UTF-32 value from a substring.
				232	* Used when iterating forwards or backwards (with
				233	* <code>UTF16.getCharCount()</code>, as well as random access. If a
				234	* validity check is required, use
				235	* <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
				236	* </a></code> on the return value.
				237	* If the char retrieved is part of a surrogate pair, its supplementary
				238	* character will be returned. If a complete supplementary character is
				239	* not found the incomplete character will be returned
				240	* @param source array of UTF-16 chars
				241	* @param start offset to substring in the source array for analyzing
				242	* @param limit offset to substring in the source array for analyzing
				243	* @param offset16 UTF-16 offset relative to start
				244	* @return UTF-32 value for the UTF-32 value that contains the char at
				245	* offset16. The boundaries of that codepoint are the same as in
				246	* <code>bounds32()</code>.
				247	* @exception IndexOutOfBoundsException thrown if offset16 is not within
				248	* the range of start and limit.
				249	* @stable ICU 2.1
				250	*/
				251	public static int charAt(char source[], int start, int limit,
				252	int offset16)
				253	{
				254	offset16 += start;
				255	if (offset16 < start \|\| offset16 >= limit) {
				256	throw new ArrayIndexOutOfBoundsException(offset16);
				257	}
				258
				259	char single = source[offset16];
				260	if (!isSurrogate(single)) {
				261	return single;
				262	}
				263
				264	// Convert the UTF-16 surrogate pair if necessary.
				265	// For simplicity in usage, and because the frequency of pairs is
				266	// low, look both directions.
				267	if (single <= LEAD_SURROGATE_MAX_VALUE) {
				268	offset16 ++;
				269	if (offset16 >= limit) {
				270	return single;
				271	}
				272	char trail = source[offset16];
				273	if (isTrailSurrogate(trail)) {
				274	return UCharacterProperty.getRawSupplementary(single, trail);
				275	}
				276	}
				277	else { // isTrailSurrogate(single), so
				278	if (offset16 == start) {
				279	return single;
				280	}
				281	offset16 --;
				282	char lead = source[offset16];
				283	if (isLeadSurrogate(lead))
				284	return UCharacterProperty.getRawSupplementary(lead, single);
				285	}
				286	return single; // return unmatched surrogate
				287	}
				288
				289	/**
				290	* Determines how many chars this char32 requires.
				291	* If a validity check is required, use <code>
				292	* <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
				293	* char32 before calling.
				294	* @param char32 the input codepoint.
				295	* @return 2 if is in supplementary space, otherwise 1.
				296	* @stable ICU 2.1
				297	*/
				298	public static int getCharCount(int char32)
				299	{
				300	if (char32 < SUPPLEMENTARY_MIN_VALUE) {
				301	return 1;
				302	}
				303	return 2;
				304	}
				305
				306	/**
				307	* Determines whether the code value is a surrogate.
				308	* @param char16 the input character.
				309	* @return true iff the input character is a surrogate.
				310	* @stable ICU 2.1
				311	*/
				312	public static boolean isSurrogate(char char16)
				313	{
				314	return LEAD_SURROGATE_MIN_VALUE <= char16 &&
				315	char16 <= TRAIL_SURROGATE_MAX_VALUE;
				316	}
				317
				318	/**
				319	* Determines whether the character is a trail surrogate.
				320	* @param char16 the input character.
				321	* @return true iff the input character is a trail surrogate.
				322	* @stable ICU 2.1
				323	*/
				324	public static boolean isTrailSurrogate(char char16)
				325	{
				326	return (TRAIL_SURROGATE_MIN_VALUE <= char16 &&
				327	char16 <= TRAIL_SURROGATE_MAX_VALUE);
				328	}
				329
				330	/**
				331	* Determines whether the character is a lead surrogate.
				332	* @param char16 the input character.
				333	* @return true iff the input character is a lead surrogate
				334	* @stable ICU 2.1
				335	*/
				336	public static boolean isLeadSurrogate(char char16)
				337	{
				338	return LEAD_SURROGATE_MIN_VALUE <= char16 &&
				339	char16 <= LEAD_SURROGATE_MAX_VALUE;
				340	}
				341
				342	/**
				343	* Returns the lead surrogate.
				344	* If a validity check is required, use
				345	* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
				346	* on char32 before calling.
				347	* @param char32 the input character.
				348	* @return lead surrogate if the getCharCount(ch) is 2; <br>
				349	* and 0 otherwise (note: 0 is not a valid lead surrogate).
				350	* @stable ICU 2.1
				351	*/
				352	public static char getLeadSurrogate(int char32)
				353	{
				354	if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
				355	return (char)(LEAD_SURROGATE_OFFSET_ +
				356	(char32 >> LEAD_SURROGATE_SHIFT_));
				357	}
				358
				359	return 0;
				360	}
				361
				362	/**
				363	* Returns the trail surrogate.
				364	* If a validity check is required, use
				365	* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
				366	* on char32 before calling.
				367	* @param char32 the input character.
				368	* @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
				369	* the character itself
				370	* @stable ICU 2.1
				371	*/
				372	public static char getTrailSurrogate(int char32)
				373	{
				374	if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
				375	return (char)(TRAIL_SURROGATE_MIN_VALUE +
				376	(char32 & TRAIL_SURROGATE_MASK_));
				377	}
				378
				379	return (char)char32;
				380	}
				381
				382	/**
				383	* Convenience method corresponding to String.valueOf(char). Returns a one
				384	* or two char string containing the UTF-32 value in UTF16 format. If a
				385	* validity check is required, use
				386	* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
				387	* on char32 before calling.
				388	* @param char32 the input character.
				389	* @return string value of char32 in UTF16 format
				390	* @exception IllegalArgumentException thrown if char32 is a invalid
				391	* codepoint.
				392	* @stable ICU 2.1
				393	*/
				394	public static String valueOf(int char32)
				395	{
				396	if (char32 < CODEPOINT_MIN_VALUE \|\| char32 > CODEPOINT_MAX_VALUE) {
				397	throw new IllegalArgumentException("Illegal codepoint");
				398	}
				399	return toString(char32);
				400	}
				401
				402	/**
				403	* Append a single UTF-32 value to the end of a StringBuffer.
				404	* If a validity check is required, use
				405	* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
				406	* on char32 before calling.
				407	* @param target the buffer to append to
				408	* @param char32 value to append.
				409	* @return the updated StringBuffer
				410	* @exception IllegalArgumentException thrown when char32 does not lie
				411	* within the range of the Unicode codepoints
				412	* @stable ICU 2.1
				413	*/
				414	public static StringBuffer append(StringBuffer target, int char32)
				415	{
				416	// Check for irregular values
				417	if (char32 < CODEPOINT_MIN_VALUE \|\| char32 > CODEPOINT_MAX_VALUE) {
				418	throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
				419	}
				420
				421	// Write the UTF-16 values
				422	if (char32 >= SUPPLEMENTARY_MIN_VALUE)
				423	{
				424	target.append(getLeadSurrogate(char32));
				425	target.append(getTrailSurrogate(char32));
				426	}
				427	else {
				428	target.append((char)char32);
				429	}
				430	return target;
				431	}
				432
				433	//// for StringPrep
				434	/**
				435	* Shifts offset16 by the argument number of codepoints within a subarray.
				436	* @param source char array
				437	* @param start position of the subarray to be performed on
				438	* @param limit position of the subarray to be performed on
				439	* @param offset16 UTF16 position to shift relative to start
				440	* @param shift32 number of codepoints to shift
				441	* @return new shifted offset16 relative to start
				442	* @exception IndexOutOfBoundsException if the new offset16 is out of
				443	* bounds with respect to the subarray or the subarray bounds
				444	* are out of range.
				445	* @stable ICU 2.1
				446	*/
				447	public static int moveCodePointOffset(char source[], int start, int limit,
				448	int offset16, int shift32)
				449	{
				450	int size = source.length;
				451	int count;
				452	char ch;
				453	int result = offset16 + start;
				454	if (start<0 \|\| limit<start) {
				455	throw new StringIndexOutOfBoundsException(start);
				456	}
				457	if (limit>size) {
				458	throw new StringIndexOutOfBoundsException(limit);
				459	}
				460	if (offset16<0 \|\| result>limit) {
				461	throw new StringIndexOutOfBoundsException(offset16);
				462	}
				463	if (shift32 > 0 ) {
				464	if (shift32 + result > size) {
				465	throw new StringIndexOutOfBoundsException(result);
				466	}
				467	count = shift32;
				468	while (result < limit && count > 0)
				469	{
				470	ch = source[result];
				471	if (isLeadSurrogate(ch) && (result+1 < limit) &&
				472	isTrailSurrogate(source[result+1])) {
				473	result ++;
				474	}
				475	count --;
				476	result ++;
				477	}
				478	} else {
				479	if (result + shift32 < start) {
				480	throw new StringIndexOutOfBoundsException(result);
				481	}
				482	for (count=-shift32; count>0; count--) {
				483	result--;
				484	if (result<start) {
				485	break;
				486	}
				487	ch = source[result];
				488	if (isTrailSurrogate(ch) && result>start && isLeadSurrogate(source[result-1])) {
				489	result--;
				490	}
				491	}
				492	}
				493	if (count != 0) {
				494	throw new StringIndexOutOfBoundsException(shift32);
				495	}
				496	result -= start;
				497	return result;
				498	}
				499
				500	// private data members -------------------------------------------------
				501
				502	/**
				503	* Shift value for lead surrogate to form a supplementary character.
				504	*/
				505	private static final int LEAD_SURROGATE_SHIFT_ = 10;
				506
				507	/**
				508	* Mask to retrieve the significant value from a trail surrogate.
				509	*/
				510	private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
				511
				512	/**
				513	* Value that all lead surrogate starts with
				514	*/
				515	private static final int LEAD_SURROGATE_OFFSET_ =
				516	LEAD_SURROGATE_MIN_VALUE -
				517	(SUPPLEMENTARY_MIN_VALUE
				518	>> LEAD_SURROGATE_SHIFT_);
				519
				520	// private methods ------------------------------------------------------
				521
				522	/**
				523	* <p>Converts argument code point and returns a String object representing
				524	* the code point's value in UTF16 format.</p>
				525	* <p>This method does not check for the validity of the codepoint, the
				526	* results are not guaranteed if a invalid codepoint is passed as
				527	* argument.</p>
				528	* <p>The result is a string whose length is 1 for non-supplementary code
				529	* points, 2 otherwise.</p>
				530	* @param ch code point
				531	* @return string representation of the code point
				532	*/
				533	private static String toString(int ch)
				534	{
				535	if (ch < SUPPLEMENTARY_MIN_VALUE) {
				536	return String.valueOf((char)ch);
				537	}
				538
				539	StringBuffer result = new StringBuffer();
				540	result.append(getLeadSurrogate(ch));
				541	result.append(getTrailSurrogate(ch));
				542	return result.toString();
				543	}
				544	}