J. Duke | 319a3b9 | 2007-12-01 00:00:00 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Copyright 2000-2001 Sun Microsystems, Inc. All Rights Reserved. |
| 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | * |
| 5 | * This code is free software; you can redistribute it and/or modify it |
| 6 | * under the terms of the GNU General Public License version 2 only, as |
| 7 | * published by the Free Software Foundation. Sun designates this |
| 8 | * particular file as subject to the "Classpath" exception as provided |
| 9 | * by Sun in the LICENSE file that accompanied this code. |
| 10 | * |
| 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | * accompanied this code). |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License version |
| 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | * |
| 21 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
| 22 | * CA 95054 USA or visit www.sun.com if you need additional information or |
| 23 | * have any questions. |
| 24 | */ |
| 25 | |
| 26 | package sun.nio.cs; |
| 27 | |
| 28 | import java.nio.CharBuffer; |
| 29 | import java.nio.charset.CoderResult; |
| 30 | import java.nio.charset.MalformedInputException; |
| 31 | import java.nio.charset.UnmappableCharacterException; |
| 32 | |
| 33 | |
/**
 * Utility class for dealing with UTF-16 surrogates.
 *
 * <p> The static methods classify and convert individual UTF-16 code units
 * and UCS-4 values.  The nested {@link Parser} and {@link Generator} classes
 * hold the result state of a single parse or generate operation so that
 * charset implementations can share the surrogate-handling details.
 *
 * @author Mark Reinhold
 */

public class Surrogate {

    private Surrogate() { }

    // UTF-16 surrogate-character ranges
    //
    public static final char MIN_HIGH = '\uD800';
    public static final char MAX_HIGH = '\uDBFF';
    public static final char MIN_LOW  = '\uDC00';
    public static final char MAX_LOW  = '\uDFFF';
    public static final char MIN = MIN_HIGH;
    public static final char MAX = MAX_LOW;

    // Range of UCS-4 values that need surrogates in UTF-16
    //
    public static final int UCS4_MIN = 0x10000;
    public static final int UCS4_MAX = (1 << 20) + UCS4_MIN - 1;

    /**
     * Tells whether or not the given UTF-16 value is a high surrogate.
     *
     * @param  c  The value to test
     *
     * @return  <tt>true</tt> if, and only if, c lies in the range
     *          MIN_HIGH through MAX_HIGH, inclusive
     */
    public static boolean isHigh(int c) {
        return (MIN_HIGH <= c) && (c <= MAX_HIGH);
    }

    /**
     * Tells whether or not the given UTF-16 value is a low surrogate.
     *
     * @param  c  The value to test
     *
     * @return  <tt>true</tt> if, and only if, c lies in the range
     *          MIN_LOW through MAX_LOW, inclusive
     */
    public static boolean isLow(int c) {
        return (MIN_LOW <= c) && (c <= MAX_LOW);
    }

    /**
     * Tells whether or not the given UTF-16 value is a surrogate character.
     *
     * @param  c  The value to test
     *
     * @return  <tt>true</tt> if, and only if, c lies in the range
     *          MIN through MAX, inclusive
     */
    public static boolean is(int c) {
        return (MIN <= c) && (c <= MAX);
    }

    /**
     * Tells whether or not the given UCS-4 character must be represented as a
     * surrogate pair in UTF-16.
     *
     * @param  uc  The UCS-4 value to test
     *
     * @return  <tt>true</tt> if, and only if, uc lies in the range
     *          UCS4_MIN through UCS4_MAX, inclusive
     */
    public static boolean neededFor(int uc) {
        return (uc >= UCS4_MIN) && (uc <= UCS4_MAX);
    }

    /**
     * Returns the high UTF-16 surrogate for the given UCS-4 character.
     *
     * @param  uc  A UCS-4 character for which neededFor(uc) holds (checked
     *             only when assertions are enabled)
     *
     * @return  The high surrogate, built from the upper ten bits of
     *          (uc - UCS4_MIN)
     */
    public static char high(int uc) {
        assert neededFor(uc);
        return (char)(0xd800 | (((uc - UCS4_MIN) >> 10) & 0x3ff));
    }

    /**
     * Returns the low UTF-16 surrogate for the given UCS-4 character.
     *
     * @param  uc  A UCS-4 character for which neededFor(uc) holds (checked
     *             only when assertions are enabled)
     *
     * @return  The low surrogate, built from the lower ten bits of
     *          (uc - UCS4_MIN)
     */
    public static char low(int uc) {
        assert neededFor(uc);
        return (char)(0xdc00 | ((uc - UCS4_MIN) & 0x3ff));
    }

    /**
     * Converts the given surrogate pair into a 32-bit UCS-4 character.
     *
     * @param  c  The high surrogate
     * @param  d  The low surrogate
     *
     * @return  The UCS-4 character encoded by the pair
     */
    public static int toUCS4(char c, char d) {
        assert isHigh(c) && isLow(d);
        return (((c & 0x3ff) << 10) | (d & 0x3ff)) + 0x10000;
    }

    /**
     * Surrogate parsing support.  Charset implementations may use instances of
     * this class to handle the details of parsing UTF-16 surrogate pairs.
     *
     * <p> An instance records the outcome of the most recent parse: either a
     * character (error is null) or an error result.  The accessor methods
     * assert that they are called in the matching state.
     */
    public static class Parser {

        public Parser() { }

        private int character;          // UCS-4
        // Non-null until a parse succeeds; null means "last parse succeeded"
        private CoderResult error = CoderResult.UNDERFLOW;
        private boolean isPair;

        /**
         * Returns the UCS-4 character previously parsed.
         *
         * @return  The character stored by the last successful parse
         */
        public int character() {
            assert (error == null);
            return character;
        }

        /**
         * Tells whether or not the previously-parsed UCS-4 character was
         * originally represented by a surrogate pair.
         *
         * @return  <tt>true</tt> if the last successful parse combined a
         *          high and a low surrogate
         */
        public boolean isPair() {
            assert (error == null);
            return isPair;
        }

        /**
         * Returns the number of UTF-16 characters consumed by the previous
         * parse.
         *
         * @return  2 if the previous parse read a surrogate pair, else 1
         */
        public int increment() {
            assert (error == null);
            return isPair ? 2 : 1;
        }

        /**
         * If the previous parse operation detected an error, return the object
         * describing that error.
         *
         * @return  The result object recorded by the failed parse
         */
        public CoderResult error() {
            assert (error != null);
            return error;
        }

        /**
         * Returns an unmappable-input result object, with the appropriate
         * input length, for the previously-parsed character.
         *
         * @return  An unmappable result of length 2 for a surrogate pair,
         *          else of length 1
         */
        public CoderResult unmappableResult() {
            assert (error == null);
            return CoderResult.unmappableForLength(isPair ? 2 : 1);
        }

        /**
         * Parses a UCS-4 character from the given source buffer, handling
         * surrogates.
         *
         * <p> When c is a high surrogate and the buffer has a character
         * remaining, that character is read with a relative get and therefore
         * consumed from the buffer even if it turns out not to be a low
         * surrogate.
         *
         * @param  c    The first character
         * @param  in   The source buffer, from which one more character
         *              will be consumed if c is a high surrogate
         *
         * @return  Either a parsed UCS-4 character, in which case the isPair()
         *          and increment() methods will return meaningful values, or
         *          -1, in which case error() will return a descriptive result
         *          object
         */
        public int parse(char c, CharBuffer in) {
            if (Surrogate.isHigh(c)) {
                if (!in.hasRemaining()) {
                    // The low half of the pair has not arrived yet
                    error = CoderResult.UNDERFLOW;
                    return -1;
                }
                char d = in.get();
                if (Surrogate.isLow(d)) {
                    character = toUCS4(c, d);
                    isPair = true;
                    error = null;
                    return character;
                }
                // High surrogate not followed by a low surrogate
                error = CoderResult.malformedForLength(1);
                return -1;
            }
            if (Surrogate.isLow(c)) {
                // Low surrogate with no preceding high surrogate
                error = CoderResult.malformedForLength(1);
                return -1;
            }
            character = c;
            isPair = false;
            error = null;
            return character;
        }

        /**
         * Parses a UCS-4 character from the given source array, handling
         * surrogates.  The array is only read; no position is advanced, so
         * the caller should use increment() to step past the parsed input.
         *
         * @param  c    The first character
         * @param  ia   The input array, from which one more character
         *              will be consumed if c is a high surrogate
         * @param  ip   The input index
         * @param  il   The input limit
         *
         * @return  Either a parsed UCS-4 character, in which case the isPair()
         *          and increment() methods will return meaningful values, or
         *          -1, in which case error() will return a descriptive result
         *          object
         */
        public int parse(char c, char[] ia, int ip, int il) {
            assert (ia[ip] == c);
            if (Surrogate.isHigh(c)) {
                if (il - ip < 2) {
                    // The low half of the pair lies beyond the limit
                    error = CoderResult.UNDERFLOW;
                    return -1;
                }
                char d = ia[ip + 1];
                if (Surrogate.isLow(d)) {
                    character = toUCS4(c, d);
                    isPair = true;
                    error = null;
                    return character;
                }
                // High surrogate not followed by a low surrogate
                error = CoderResult.malformedForLength(1);
                return -1;
            }
            if (Surrogate.isLow(c)) {
                // Low surrogate with no preceding high surrogate
                error = CoderResult.malformedForLength(1);
                return -1;
            }
            character = c;
            isPair = false;
            error = null;
            return character;
        }

    }

    /**
     * Surrogate generation support.  Charset implementations may use instances
     * of this class to handle the details of generating UTF-16 surrogate
     * pairs.
     */
    public static class Generator {

        public Generator() { }

        // Non-null until a generate succeeds; null means "last call succeeded"
        private CoderResult error = CoderResult.OVERFLOW;

        /**
         * If the previous generation operation detected an error, return the
         * object describing that error.
         *
         * @return  The result object recorded by the failed generation
         */
        public CoderResult error() {
            assert error != null;
            return error;
        }

        /**
         * Generates one or two UTF-16 characters to represent the given UCS-4
         * character.
         *
         * <p> Values outside the range zero through UCS4_MAX, including
         * negative values, cannot be represented in UTF-16 and yield an
         * unmappable result.  Values in the surrogate range yield a malformed
         * result.  Nothing is written to the buffer when -1 is returned.
         *
         * @param  uc   The UCS-4 character
         * @param  len  The number of input bytes from which the UCS-4 value
         *              was constructed (used when creating result objects)
         * @param  dst  The destination buffer, to which one or two UTF-16
         *              characters will be written
         *
         * @return  Either a positive count of the number of UTF-16 characters
         *          written to the destination buffer, or -1, in which case
         *          error() will return a descriptive result object
         */
        public int generate(int uc, int len, CharBuffer dst) {
            // Reject negatives explicitly: a bare (char) cast would silently
            // truncate them into an unrelated BMP character
            if ((uc < 0) || (uc > Surrogate.UCS4_MAX)) {
                error = CoderResult.unmappableForLength(len);
                return -1;
            }
            if (uc < Surrogate.UCS4_MIN) {
                // BMP value: a single UTF-16 character
                if (Surrogate.is(uc)) {
                    error = CoderResult.malformedForLength(len);
                    return -1;
                }
                if (dst.remaining() < 1) {
                    error = CoderResult.OVERFLOW;
                    return -1;
                }
                dst.put((char)uc);
                error = null;
                return 1;
            }
            // Supplementary value: a surrogate pair
            if (dst.remaining() < 2) {
                error = CoderResult.OVERFLOW;
                return -1;
            }
            dst.put(Surrogate.high(uc));
            dst.put(Surrogate.low(uc));
            error = null;
            return 2;
        }

        /**
         * Generates one or two UTF-16 characters to represent the given UCS-4
         * character.
         *
         * <p> Values outside the range zero through UCS4_MAX, including
         * negative values, cannot be represented in UTF-16 and yield an
         * unmappable result.  Values in the surrogate range yield a malformed
         * result.  Nothing is written to the array when -1 is returned.
         *
         * @param  uc   The UCS-4 character
         * @param  len  The number of input bytes from which the UCS-4 value
         *              was constructed (used when creating result objects)
         * @param  da   The destination array, to which one or two UTF-16
         *              characters will be written
         * @param  dp   The destination position
         * @param  dl   The destination limit
         *
         * @return  Either a positive count of the number of UTF-16 characters
         *          written to the destination array, or -1, in which case
         *          error() will return a descriptive result object
         */
        public int generate(int uc, int len, char[] da, int dp, int dl) {
            // Reject negatives explicitly: a bare (char) cast would silently
            // truncate them into an unrelated BMP character
            if ((uc < 0) || (uc > Surrogate.UCS4_MAX)) {
                error = CoderResult.unmappableForLength(len);
                return -1;
            }
            if (uc < Surrogate.UCS4_MIN) {
                // BMP value: a single UTF-16 character
                if (Surrogate.is(uc)) {
                    error = CoderResult.malformedForLength(len);
                    return -1;
                }
                if (dl - dp < 1) {
                    error = CoderResult.OVERFLOW;
                    return -1;
                }
                da[dp] = (char)uc;
                error = null;
                return 1;
            }
            // Supplementary value: a surrogate pair
            if (dl - dp < 2) {
                error = CoderResult.OVERFLOW;
                return -1;
            }
            da[dp] = Surrogate.high(uc);
            da[dp + 1] = Surrogate.low(uc);
            error = null;
            return 2;
        }

    }

}