Blame - jdk/src/share/classes/java/net/URLEncoder.java - platform/libcore

blob: cea50dfa06910a35cb558dbb8bd115832b9466a3 [file] [log] [blame]

J. Duke	319a3b9	2007-12-01 00:00:00 +0000	[diff] [blame^]	1	/*
				2	* Copyright 1995-2006 Sun Microsystems, Inc. All Rights Reserved.
				3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				4	*
				5	* This code is free software; you can redistribute it and/or modify it
				6	* under the terms of the GNU General Public License version 2 only, as
				7	* published by the Free Software Foundation. Sun designates this
				8	* particular file as subject to the "Classpath" exception as provided
				9	* by Sun in the LICENSE file that accompanied this code.
				10	*
				11	* This code is distributed in the hope that it will be useful, but WITHOUT
				12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				14	* version 2 for more details (a copy is included in the LICENSE file that
				15	* accompanied this code).
				16	*
				17	* You should have received a copy of the GNU General Public License version
				18	* 2 along with this work; if not, write to the Free Software Foundation,
				19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
				20	*
				21	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
				22	* CA 95054 USA or visit www.sun.com if you need additional information or
				23	* have any questions.
				24	*/
				25
				26	package java.net;
				27
				28	import java.io.ByteArrayOutputStream;
				29	import java.io.BufferedWriter;
				30	import java.io.OutputStreamWriter;
				31	import java.io.IOException;
				32	import java.io.UnsupportedEncodingException;
				33	import java.io.CharArrayWriter;
				34	import java.nio.charset.Charset;
				35	import java.nio.charset.IllegalCharsetNameException;
				36	import java.nio.charset.UnsupportedCharsetException ;
				37	import java.util.BitSet;
				38	import java.security.AccessController;
				39	import java.security.PrivilegedAction;
				40	import sun.security.action.GetBooleanAction;
				41	import sun.security.action.GetPropertyAction;
				42
				43	/**
				44	* Utility class for HTML form encoding. This class contains static methods
				45	* for converting a String to the <CODE>application/x-www-form-urlencoded</CODE> MIME
				46	* format. For more information about HTML form encoding, consult the HTML
				47	* <A HREF="http://www.w3.org/TR/html4/">specification</A>.
				48	*
				49	* <p>
				50	* When encoding a String, the following rules apply:
				51	*
				52	* <p>
				53	* <ul>
				54	* <li>The alphanumeric characters "<code>a</code>" through
				55	* "<code>z</code>", "<code>A</code>" through
				56	* "<code>Z</code>" and "<code>0</code>"
				57	* through "<code>9</code>" remain the same.
				58	* <li>The special characters "<code>.</code>",
				59	* "<code>-</code>", "<code>*</code>", and
				60	* "<code>_</code>" remain the same.
				61	* <li>The space character "<code> </code>" is
				62	* converted into a plus sign "<code>+</code>".
				63	* <li>All other characters are unsafe and are first converted into
				64	* one or more bytes using some encoding scheme. Then each byte is
				65	* represented by the 3-character string
				66	* "<code>%<i>xy</i></code>", where <i>xy</i> is the
				67	* two-digit hexadecimal representation of the byte.
				68	* The recommended encoding scheme to use is UTF-8. However,
				69	* for compatibility reasons, if an encoding is not specified,
				70	* then the default encoding of the platform is used.
				71	* </ul>
				72	*
				73	* <p>
				74	* For example using UTF-8 as the encoding scheme the string "The
				75	* string ü@foo-bar" would get converted to
				76	* "The+string+%C3%BC%40foo-bar" because in UTF-8 the character
				77	* ü is encoded as two bytes C3 (hex) and BC (hex), and the
				78	* character @ is encoded as one byte 40 (hex).
				79	*
				80	* @author Herb Jellinek
				81	* @since JDK1.0
				82	*/
				public class URLEncoder {
					/**
					 * Characters that are copied to the output without percent-encoding.
					 * The space character is a member as well; <code>encode</code> turns
					 * it into a plus sign instead of copying it verbatim.
					 */
					static final BitSet dontNeedEncoding;

					/**
					 * Distance between a lower case ASCII letter and its upper case form.
					 * No longer read by this class; kept because it is package-visible.
					 */
					static final int caseDiff = ('a' - 'A');

					/**
					 * Value of the <code>file.encoding</code> system property, read once
					 * when this class is initialized.
					 */
					static final String dfltEncName;

					/** Upper case hexadecimal digits, indexed by nibble value. */
					private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

					static {

						/* The list of characters that are not encoded has been
						 * determined as follows:
						 *
						 * RFC 2396 states:
						 * -----
						 * Data characters that are allowed in a URI but do not have a
						 * reserved purpose are called unreserved. These include upper
						 * and lower case letters, decimal digits, and a limited set of
						 * punctuation marks and symbols.
						 *
						 * unreserved = alphanum | mark
						 *
						 * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
						 *
						 * Unreserved characters can be escaped without changing the
						 * semantics of the URI, but this should not be done unless the
						 * URI is being used in a context that does not allow the
						 * unescaped character to appear.
						 * -----
						 *
						 * It appears that both Netscape and Internet Explorer escape
						 * all special characters from this list with the exception
						 * of "-", "_", ".", "*". While it is not clear why they are
						 * escaping the other characters, perhaps it is safest to
						 * assume that there might be contexts in which the others
						 * are unsafe if not escaped. Therefore, we will use the same
						 * list. It is also noteworthy that this is consistent with
						 * O'Reilly's "HTML: The Definitive Guide" (page 164).
						 *
						 * As a last note, Internet Explorer does not encode the "@"
						 * character which is clearly not unreserved according to the
						 * RFC. We are being consistent with the RFC in this matter,
						 * as is Netscape.
						 */
						BitSet safe = new BitSet(256);
						safe.set('a', 'z' + 1);
						safe.set('A', 'Z' + 1);
						safe.set('0', '9' + 1);
						// Space is listed as "safe" so that it is not percent-encoded;
						// the encode() method converts it to '+'.
						safe.set(' ');
						safe.set('-');
						safe.set('_');
						safe.set('.');
						safe.set('*');
						dontNeedEncoding = safe;

						// Read the property inside doPrivileged using only public API.
						dfltEncName = AccessController.doPrivileged(
							new PrivilegedAction<String>() {
								public String run() {
									return System.getProperty("file.encoding");
								}
							});
					}

					/**
					 * You can't call the constructor.
					 */
					private URLEncoder() { }

					/**
					 * Translates a string into <code>x-www-form-urlencoded</code>
					 * format. This method uses the platform's default encoding
					 * as the encoding scheme to obtain the bytes for unsafe characters.
					 *
					 * @param s <code>String</code> to be translated.
					 * @deprecated The resulting string may vary depending on the platform's
					 * default encoding. Instead, use the encode(String,String)
					 * method to specify the encoding.
					 * @return the translated <code>String</code>, or <code>null</code> if
					 * the platform default encoding is reported as unsupported.
					 */
					@Deprecated
					public static String encode(String s) {

						String str = null;

						try {
							str = encode(s, dfltEncName);
						} catch (UnsupportedEncodingException ignored) {
							// The system should always have the platform default, so this
							// is not expected to happen; if it does, null is returned.
						}

						return str;
					}

					/**
					 * Translates a string into <code>application/x-www-form-urlencoded</code>
					 * format using a specific encoding scheme. This method uses the
					 * supplied encoding scheme to obtain the bytes for unsafe
					 * characters.
					 * <p>
					 * <em><strong>Note:</strong> The <a href=
					 * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">
					 * World Wide Web Consortium Recommendation</a> states that
					 * UTF-8 should be used. Not doing so may introduce
					 * incompatibilities.</em>
					 * <p>
					 * If no character of <code>s</code> has to be changed, <code>s</code>
					 * itself is returned.
					 *
					 * @param s <code>String</code> to be translated.
					 * @param enc The name of a supported
					 * <a href="../lang/package-summary.html#charenc">character
					 * encoding</a>.
					 * @return the translated <code>String</code>.
					 * @exception UnsupportedEncodingException
					 * If the named encoding is not supported
					 * @exception NullPointerException
					 * If <code>s</code> or <code>enc</code> is <code>null</code>
					 * @see URLDecoder#decode(java.lang.String, java.lang.String)
					 * @since 1.4
					 */
					public static String encode(String s, String enc)
						throws UnsupportedEncodingException {

						StringBuilder out = new StringBuilder(s.length());

						if (enc == null) {
							throw new NullPointerException("charsetName");
						}

						Charset charset;
						try {
							charset = Charset.forName(enc);
						} catch (IllegalCharsetNameException e) {
							throw unsupportedEncoding(enc, e);
						} catch (UnsupportedCharsetException e) {
							throw unsupportedEncoding(enc, e);
						}

						boolean needToChange = false;
						final int length = s.length();
						int i = 0;
						while (i < length) {
							char c = s.charAt(i);
							if (dontNeedEncoding.get(c)) {
								if (c == ' ') {
									c = '+';
									needToChange = true;
								}
								out.append(c);
								i++;
							} else {
								/*
								 * Collect the maximal run of characters that need
								 * encoding and convert it to bytes in one step, so that
								 * a surrogate pair is always handed to the charset as a
								 * whole. Surrogates are never members of
								 * dontNeedEncoding, hence no pair can be split at the
								 * end of a run. A surrogate that is not part of a legal
								 * pair is treated like any other character; what bytes
								 * it produces is decided by String.getBytes(Charset).
								 */
								int runStart = i;
								do {
									i++;
								} while (i < length && !dontNeedEncoding.get(s.charAt(i)));

								appendEscaped(out, s.substring(runStart, i).getBytes(charset));
								needToChange = true;
							}
						}

						return (needToChange ? out.toString() : s);
					}

					/**
					 * Appends every byte as "%XY", where XY is the two-digit upper case
					 * hexadecimal value of the byte.
					 */
					private static void appendEscaped(StringBuilder out, byte[] bytes) {
						for (int j = 0; j < bytes.length; j++) {
							int b = bytes[j];
							out.append('%');
							// The mask discards the sign extension of negative bytes.
							out.append(HEX_DIGITS[(b >> 4) & 0xF]);
							out.append(HEX_DIGITS[b & 0xF]);
						}
					}

					/**
					 * Builds the exception reported for an unusable encoding name,
					 * keeping the charset lookup failure as its cause.
					 */
					private static UnsupportedEncodingException unsupportedEncoding(
							String enc, IllegalArgumentException cause) {
						UnsupportedEncodingException uee = new UnsupportedEncodingException(enc);
						uee.initCause(cause);
						return uee;
					}
				}