J. Duke | 319a3b9 | 2007-12-01 00:00:00 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Copyright 1995-2006 Sun Microsystems, Inc. All Rights Reserved. |
| 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | * |
| 5 | * This code is free software; you can redistribute it and/or modify it |
| 6 | * under the terms of the GNU General Public License version 2 only, as |
| 7 | * published by the Free Software Foundation. Sun designates this |
| 8 | * particular file as subject to the "Classpath" exception as provided |
| 9 | * by Sun in the LICENSE file that accompanied this code. |
| 10 | * |
| 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | * accompanied this code). |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License version |
| 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | * |
| 21 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
| 22 | * CA 95054 USA or visit www.sun.com if you need additional information or |
| 23 | * have any questions. |
| 24 | */ |
| 25 | |
| 26 | package java.net; |
| 27 | |
| 28 | import java.io.ByteArrayOutputStream; |
| 29 | import java.io.BufferedWriter; |
| 30 | import java.io.OutputStreamWriter; |
| 31 | import java.io.IOException; |
| 32 | import java.io.UnsupportedEncodingException; |
| 33 | import java.io.CharArrayWriter; |
| 34 | import java.nio.charset.Charset; |
| 35 | import java.nio.charset.IllegalCharsetNameException; |
| 36 | import java.nio.charset.UnsupportedCharsetException ; |
| 37 | import java.util.BitSet; |
| 38 | import java.security.AccessController; |
| 39 | import java.security.PrivilegedAction; |
| 40 | import sun.security.action.GetBooleanAction; |
| 41 | import sun.security.action.GetPropertyAction; |
| 42 | |
| 43 | /** |
| 44 | * Utility class for HTML form encoding. This class contains static methods |
| 45 | * for converting a String to the <CODE>application/x-www-form-urlencoded</CODE> MIME |
| 46 | * format. For more information about HTML form encoding, consult the HTML |
| 47 | * <A HREF="http://www.w3.org/TR/html4/">specification</A>. |
| 48 | * |
| 49 | * <p> |
| 50 | * When encoding a String, the following rules apply: |
| 51 | * |
| 52 | * <p> |
| 53 | * <ul> |
| 54 | * <li>The alphanumeric characters "<code>a</code>" through |
| 55 | * "<code>z</code>", "<code>A</code>" through |
| 56 | * "<code>Z</code>" and "<code>0</code>" |
| 57 | * through "<code>9</code>" remain the same. |
| 58 | * <li>The special characters "<code>.</code>", |
| 59 | * "<code>-</code>", "<code>*</code>", and |
| 60 | * "<code>_</code>" remain the same. |
| 61 | * <li>The space character "<code> </code>" is |
| 62 | * converted into a plus sign "<code>+</code>". |
| 63 | * <li>All other characters are unsafe and are first converted into |
| 64 | * one or more bytes using some encoding scheme. Then each byte is |
| 65 | * represented by the 3-character string |
| 66 | * "<code>%<i>xy</i></code>", where <i>xy</i> is the |
| 67 | * two-digit hexadecimal representation of the byte. |
| 68 | * The recommended encoding scheme to use is UTF-8. However, |
| 69 | * for compatibility reasons, if an encoding is not specified, |
| 70 | * then the default encoding of the platform is used. |
| 71 | * </ul> |
| 72 | * |
| 73 | * <p> |
| 74 | * For example using UTF-8 as the encoding scheme the string "The |
| 75 | * string ü@foo-bar" would get converted to |
| 76 | * "The+string+%C3%BC%40foo-bar" because in UTF-8 the character |
| 77 | * ü is encoded as two bytes C3 (hex) and BC (hex), and the |
| 78 | * character @ is encoded as one byte 40 (hex). |
| 79 | * |
| 80 | * @author Herb Jellinek |
| 81 | * @since JDK1.0 |
| 82 | */ |
public class URLEncoder {
    /**
     * Set of character values that are copied to the output without being
     * percent-encoded. The space character is a member of this set even
     * though it is rewritten to <code>+</code> by the encoder.
     */
    static BitSet dontNeedEncoding;

    /**
     * Distance between a lower-case ASCII letter and its upper-case form.
     * The encoder in this class no longer needs it (hex digits come from a
     * lookup table); the field is kept because it is package-visible.
     */
    static final int caseDiff = ('a' - 'A');

    /**
     * Value of the <code>file.encoding</code> system property, captured once
     * when this class is initialized. Used by {@link #encode(String)}.
     */
    static String dfltEncName = null;

    /** Upper-case hexadecimal digits, indexed by nibble value. */
    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    static {

        /* The list of characters that are not encoded has been
         * determined as follows:
         *
         * RFC 2396 states:
         * -----
         * Data characters that are allowed in a URI but do not have a
         * reserved purpose are called unreserved.  These include upper
         * and lower case letters, decimal digits, and a limited set of
         * punctuation marks and symbols.
         *
         * unreserved  = alphanum | mark
         *
         * mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
         *
         * Unreserved characters can be escaped without changing the
         * semantics of the URI, but this should not be done unless the
         * URI is being used in a context that does not allow the
         * unescaped character to appear.
         * -----
         *
         * It appears that both Netscape and Internet Explorer escape
         * all special characters from this list with the exception
         * of "-", "_", ".", "*". While it is not clear why they are
         * escaping the other characters, perhaps it is safest to
         * assume that there might be contexts in which the others
         * are unsafe if not escaped. Therefore, we will use the same
         * list. It is also noteworthy that this is consistent with
         * O'Reilly's "HTML: The Definitive Guide" (page 164).
         *
         * As a last note, Internet Explorer does not encode the "@"
         * character which is clearly not unreserved according to the
         * RFC. We are being consistent with the RFC in this matter,
         * as is Netscape.
         */

        dontNeedEncoding = new BitSet(256);
        int i;
        for (i = 'a'; i <= 'z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (i = 'A'; i <= 'Z'; i++) {
            dontNeedEncoding.set(i);
        }
        for (i = '0'; i <= '9'; i++) {
            dontNeedEncoding.set(i);
        }
        // A space is "safe" only in the sense that it is never
        // percent-encoded; the encoder turns it into '+'.
        dontNeedEncoding.set(' ');
        dontNeedEncoding.set('-');
        dontNeedEncoding.set('_');
        dontNeedEncoding.set('.');
        dontNeedEncoding.set('*');

        // Read the property inside a privileged action so the lookup is
        // performed with this class's own permissions.
        dfltEncName = AccessController.doPrivileged(
            new PrivilegedAction<String>() {
                @Override
                public String run() {
                    return System.getProperty("file.encoding");
                }
            }
        );
    }

    /**
     * You can't call the constructor.
     */
    private URLEncoder() { }

    /**
     * Translates a string into <code>x-www-form-urlencoded</code>
     * format. This method uses the platform's default encoding
     * as the encoding scheme to obtain the bytes for unsafe characters.
     * <p>
     * The encoding name is the value of the <code>file.encoding</code>
     * system property captured when this class was initialized. If that
     * name is rejected as unsupported, the charset returned by
     * {@link Charset#defaultCharset()} is used instead, so this method
     * does not return <code>null</code> for a non-null argument.
     *
     * @param   s   <code>String</code> to be translated.
     * @deprecated The resulting string may vary depending on the platform's
     *             default encoding. Instead, use the encode(String,String)
     *             method to specify the encoding.
     * @return  the translated <code>String</code>.
     */
    @Deprecated
    public static String encode(String s) {
        try {
            return encode(s, dfltEncName);
        } catch (UnsupportedEncodingException e) {
            // The captured file.encoding value does not name a usable
            // charset. Encode with the JVM's default charset rather than
            // silently handing the caller a null result.
            return encodeWithCharset(s, Charset.defaultCharset());
        }
    }

    /**
     * Translates a string into <code>application/x-www-form-urlencoded</code>
     * format using a specific encoding scheme. This method uses the
     * supplied encoding scheme to obtain the bytes for unsafe
     * characters.
     * <p>
     * <em><strong>Note:</strong> The <a href=
     * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars">
     * World Wide Web Consortium Recommendation</a> states that
     * UTF-8 should be used. Not doing so may introduce
     * incompatibilities.</em>
     * <p>
     * If no character of <code>s</code> needs to be rewritten, the argument
     * itself is returned.
     *
     * @param   s   <code>String</code> to be translated.
     * @param   enc   The name of a supported
     *    <a href="../lang/package-summary.html#charenc">character
     *    encoding</a>.
     * @return  the translated <code>String</code>.
     * @exception  UnsupportedEncodingException
     *             If the named encoding is not supported
     * @exception  NullPointerException
     *             If <code>s</code> or <code>enc</code> is <code>null</code>
     * @see URLDecoder#decode(java.lang.String, java.lang.String)
     * @since 1.4
     */
    public static String encode(String s, String enc)
        throws UnsupportedEncodingException {

        // A null string is reported before any problem with the encoding
        // name, as it always has been.
        if (s == null) {
            throw new NullPointerException("s");
        }
        if (enc == null) {
            throw new NullPointerException("charsetName");
        }

        Charset charset;
        try {
            charset = Charset.forName(enc);
        } catch (IllegalCharsetNameException e) {
            throw new UnsupportedEncodingException(enc);
        } catch (UnsupportedCharsetException e) {
            throw new UnsupportedEncodingException(enc);
        }

        return encodeWithCharset(s, charset);
    }

    /**
     * Encodes <code>s</code>, converting every maximal run of unsafe
     * characters to bytes with <code>charset</code> and writing each byte
     * as <code>%XY</code> with upper-case hex digits.
     *
     * @param   s         the string to encode; must not be <code>null</code>
     * @param   charset   the charset used to obtain bytes for unsafe runs
     * @return  the encoded string, or <code>s</code> itself when nothing
     *          had to be rewritten
     */
    private static String encodeWithCharset(String s, Charset charset) {
        final int length = s.length();
        StringBuilder out = new StringBuilder(length);
        boolean changed = false;

        int i = 0;
        while (i < length) {
            char c = s.charAt(i);

            if (dontNeedEncoding.get(c)) {
                if (c == ' ') {
                    out.append('+');
                    changed = true;
                } else {
                    out.append(c);
                }
                i++;
                continue;
            }

            // Collect the whole run of unsafe characters and convert it in
            // one step. Surrogates are never in the safe set, so a surrogate
            // pair always lies inside a single run and is handed to the
            // charset encoder together; an unpaired surrogate is treated
            // like any other unsafe character.
            int runEnd = i + 1;
            while (runEnd < length && !dontNeedEncoding.get(s.charAt(runEnd))) {
                runEnd++;
            }

            byte[] encoded = s.substring(i, runEnd).getBytes(charset);
            for (int k = 0; k < encoded.length; k++) {
                int b = encoded[k] & 0xFF;
                out.append('%');
                out.append(HEX_DIGITS[b >>> 4]);
                out.append(HEX_DIGITS[b & 0xF]);
            }

            changed = true;
            i = runEnd;
        }

        return changed ? out.toString() : s;
    }
}