J. Duke | 319a3b9 | 2007-12-01 00:00:00 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Copyright 1997-2003 Sun Microsystems, Inc. All Rights Reserved. |
| 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | * |
| 5 | * This code is free software; you can redistribute it and/or modify it |
| 6 | * under the terms of the GNU General Public License version 2 only, as |
| 7 | * published by the Free Software Foundation. Sun designates this |
| 8 | * particular file as subject to the "Classpath" exception as provided |
| 9 | * by Sun in the LICENSE file that accompanied this code. |
| 10 | * |
| 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | * accompanied this code). |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License version |
| 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | * |
| 21 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
| 22 | * CA 95054 USA or visit www.sun.com if you need additional information or |
| 23 | * have any questions. |
| 24 | */ |
| 25 | |
| 26 | package sun.io; |
| 27 | |
| 28 | import sun.nio.cs.ext.IBM949C; |
| 29 | |
| 30 | /** |
| 31 | * @author Malcolm Ayres |
| 32 | */ |
| 33 | |
| 34 | /* |
| 35 | Cp949C is a hand-modified version of Cp949 |
| 36 | maps Unicode U-005C <-> 0x5C (local code page) |
| 37 | */ |
| 38 | |
| 39 | public class CharToByteCp949C extends CharToByteConverter |
| 40 | { |
| 41 | private static final char SBase = '\uAC00'; |
| 42 | private static final char LBase = '\u1100'; |
| 43 | private static final char VBase = '\u1161'; |
| 44 | private static final char TBase = '\u11A7'; |
| 45 | private static final int VCount = 21; |
| 46 | private static final int TCount = 28; |
| 47 | private static final byte G0 = 0; |
| 48 | private static final byte G1 = 1; |
| 49 | private static final byte G2 = 2; |
| 50 | private static final byte G3 = 3; |
| 51 | private byte charState = G0; |
| 52 | private char l, v, t; |
| 53 | |
| 54 | private byte[] outputByte; |
| 55 | |
| 56 | private char highHalfZoneCode; |
| 57 | private int mask1; |
| 58 | private int mask2; |
| 59 | private int shift; |
| 60 | private short[] index1; |
| 61 | private String index2; |
| 62 | private String index2a; |
| 63 | |
| 64 | private final static IBM949C nioCoder = new IBM949C(); |
| 65 | |
| 66 | public CharToByteCp949C() { |
| 67 | super(); |
| 68 | index1 = nioCoder.getEncoderIndex1(); |
| 69 | index2 = nioCoder.getEncoderIndex2(); |
| 70 | index2a = nioCoder.getEncoderIndex2a(); |
| 71 | highHalfZoneCode = 0; |
| 72 | outputByte = new byte[2]; |
| 73 | mask1 = 0xFFF8; |
| 74 | mask2 = 0x0007; |
| 75 | shift = 3; |
| 76 | } |
| 77 | |
| 78 | /** |
| 79 | * flush out any residual data and reset the buffer state |
| 80 | */ |
| 81 | public int flush(byte[] output, int outStart, int outEnd) |
| 82 | throws MalformedInputException, |
| 83 | ConversionBufferFullException |
| 84 | { |
| 85 | int bytesOut; |
| 86 | |
| 87 | byteOff = outStart; |
| 88 | |
| 89 | if (highHalfZoneCode != 0) { |
| 90 | reset(); |
| 91 | badInputLength = 0; |
| 92 | throw new MalformedInputException(); |
| 93 | } |
| 94 | |
| 95 | if (charState != G0) { |
| 96 | try { |
| 97 | unicodeToBuffer(composeHangul() ,output, outEnd); |
| 98 | } |
| 99 | catch(UnknownCharacterException e) { |
| 100 | reset(); |
| 101 | badInputLength = 0; |
| 102 | throw new MalformedInputException(); |
| 103 | } |
| 104 | charState = G0; |
| 105 | } |
| 106 | |
| 107 | bytesOut = byteOff - outStart; |
| 108 | |
| 109 | reset(); |
| 110 | return bytesOut; |
| 111 | } |
| 112 | |
| 113 | /** |
| 114 | * Resets converter to its initial state. |
| 115 | */ |
| 116 | public void reset() { |
| 117 | highHalfZoneCode = 0; |
| 118 | charState = G0; |
| 119 | charOff = byteOff = 0; |
| 120 | } |
| 121 | |
| 122 | /** |
| 123 | * Returns true if the given character can be converted to the |
| 124 | * target character encoding. |
| 125 | */ |
| 126 | public boolean canConvert(char ch) { |
| 127 | int index; |
| 128 | int theBytes; |
| 129 | |
| 130 | index = index1[((ch & mask1) >> shift)] + (ch & mask2); |
| 131 | if (index < 15000) |
| 132 | theBytes = (int)(index2.charAt(index)); |
| 133 | else |
| 134 | theBytes = (int)(index2a.charAt(index-15000)); |
| 135 | |
| 136 | if (theBytes != 0) |
| 137 | return (true); |
| 138 | |
| 139 | // only return true if input char was unicode null - all others are |
| 140 | // undefined |
| 141 | return( ch == '\u0000'); |
| 142 | } |
| 143 | |
| 144 | /** |
| 145 | * Character conversion |
| 146 | */ |
| 147 | |
| 148 | public int convert(char[] input, int inOff, int inEnd, |
| 149 | byte[] output, int outOff, int outEnd) |
| 150 | throws UnknownCharacterException, MalformedInputException, |
| 151 | ConversionBufferFullException |
| 152 | { |
| 153 | char inputChar; |
| 154 | int inputSize; |
| 155 | |
| 156 | charOff = inOff; |
| 157 | byteOff = outOff; |
| 158 | |
| 159 | while (charOff < inEnd) { |
| 160 | |
| 161 | if (highHalfZoneCode == 0) { |
| 162 | inputChar = input[charOff]; |
| 163 | inputSize = 1; |
| 164 | } else { |
| 165 | inputChar = highHalfZoneCode; |
| 166 | inputSize = 0; |
| 167 | highHalfZoneCode = 0; |
| 168 | } |
| 169 | |
| 170 | switch (charState) { |
| 171 | case G0: |
| 172 | |
| 173 | l = LBase; |
| 174 | v = VBase; |
| 175 | t = TBase; |
| 176 | |
| 177 | if ( isLeadingC(inputChar) ) { // Leading Consonant |
| 178 | l = inputChar; |
| 179 | charState = G1; |
| 180 | break; |
| 181 | } |
| 182 | |
| 183 | if ( isVowel(inputChar) ) { // Vowel |
| 184 | v = inputChar; |
| 185 | charState = G2; |
| 186 | break; |
| 187 | } |
| 188 | |
| 189 | if ( isTrailingC(inputChar) ) { // Trailing Consonant |
| 190 | t = inputChar; |
| 191 | charState = G3; |
| 192 | break; |
| 193 | } |
| 194 | |
| 195 | break; |
| 196 | |
| 197 | case G1: |
| 198 | if ( isLeadingC(inputChar) ) { // Leading Consonant |
| 199 | l = composeLL(l, inputChar); |
| 200 | break; |
| 201 | } |
| 202 | |
| 203 | if ( isVowel(inputChar) ) { // Vowel |
| 204 | v = inputChar; |
| 205 | charState = G2; |
| 206 | break; |
| 207 | } |
| 208 | |
| 209 | if ( isTrailingC(inputChar) ) { // Trailing Consonant |
| 210 | t = inputChar; |
| 211 | charState = G3; |
| 212 | break; |
| 213 | } |
| 214 | |
| 215 | unicodeToBuffer(composeHangul(), output, outEnd); |
| 216 | |
| 217 | charState = G0; |
| 218 | break; |
| 219 | |
| 220 | case G2: |
| 221 | if ( isLeadingC(inputChar) ) { // Leading Consonant |
| 222 | |
| 223 | unicodeToBuffer(composeHangul(), output, outEnd); |
| 224 | |
| 225 | l = inputChar; |
| 226 | v = VBase; |
| 227 | t = TBase; |
| 228 | charState = G1; |
| 229 | break; |
| 230 | } |
| 231 | |
| 232 | if ( isVowel(inputChar) ) { // Vowel |
| 233 | v = composeVV(l, inputChar); |
| 234 | charState = G2; |
| 235 | break; |
| 236 | } |
| 237 | |
| 238 | if ( isTrailingC(inputChar) ) { // Trailing Consonant |
| 239 | t = inputChar; |
| 240 | charState = G3; |
| 241 | break; |
| 242 | } |
| 243 | |
| 244 | unicodeToBuffer(composeHangul(), output, outEnd); |
| 245 | |
| 246 | charState = G0; |
| 247 | |
| 248 | break; |
| 249 | |
| 250 | case G3: |
| 251 | if ( isTrailingC(inputChar) ) { // Trailing Consonant |
| 252 | t = composeTT(t, inputChar); |
| 253 | charState = G3; |
| 254 | break; |
| 255 | } |
| 256 | |
| 257 | unicodeToBuffer(composeHangul(), output, outEnd); |
| 258 | |
| 259 | charState = G0; |
| 260 | |
| 261 | break; |
| 262 | } |
| 263 | |
| 264 | if (charState != G0) |
| 265 | charOff++; |
| 266 | else { |
| 267 | |
| 268 | // Is this a high surrogate? |
| 269 | if(inputChar >= '\ud800' && inputChar <= '\udbff') { |
| 270 | // Is this the last character of the input? |
| 271 | if (charOff + inputSize >= inEnd) { |
| 272 | highHalfZoneCode = inputChar; |
| 273 | charOff += inputSize; |
| 274 | break; |
| 275 | } |
| 276 | |
| 277 | // Is there a low surrogate following? |
| 278 | inputChar = input[charOff + inputSize]; |
| 279 | if (inputChar >= '\udc00' && inputChar <= '\udfff') { |
| 280 | // We have a valid surrogate pair. Too bad we don't do |
| 281 | // surrogates. Is substitution enabled? |
| 282 | if (subMode) { |
| 283 | if (subBytes.length == 1) { |
| 284 | outputByte[0] = 0x00; |
| 285 | outputByte[1] = subBytes[0]; |
| 286 | } else { |
| 287 | outputByte[0] = subBytes[0]; |
| 288 | outputByte[1] = subBytes[1]; |
| 289 | } |
| 290 | |
| 291 | bytesToBuffer(outputByte, output, outEnd); |
| 292 | inputSize++; |
| 293 | } else { |
| 294 | badInputLength = 2; |
| 295 | throw new UnknownCharacterException(); |
| 296 | } |
| 297 | } else { |
| 298 | // We have a malformed surrogate pair |
| 299 | badInputLength = 1; |
| 300 | throw new MalformedInputException(); |
| 301 | } |
| 302 | } |
| 303 | |
| 304 | // Is this an unaccompanied low surrogate? |
| 305 | else |
| 306 | if (inputChar >= '\uDC00' && inputChar <= '\uDFFF') { |
| 307 | badInputLength = 1; |
| 308 | throw new MalformedInputException(); |
| 309 | } else { |
| 310 | unicodeToBuffer(inputChar, output, outEnd); |
| 311 | } |
| 312 | |
| 313 | charOff += inputSize; |
| 314 | |
| 315 | } |
| 316 | |
| 317 | } |
| 318 | |
| 319 | return byteOff - outOff; |
| 320 | |
| 321 | } |
| 322 | |
| 323 | private char composeHangul() { |
| 324 | int lIndex, vIndex, tIndex; |
| 325 | |
| 326 | lIndex = l - LBase; |
| 327 | vIndex = v - VBase; |
| 328 | tIndex = t - TBase; |
| 329 | |
| 330 | return (char)((lIndex * VCount + vIndex) * TCount + tIndex + SBase); |
| 331 | } |
| 332 | |
| 333 | private char composeLL(char l1, char l2) { |
| 334 | return l2; |
| 335 | } |
| 336 | |
| 337 | private char composeVV(char v1, char v2) { |
| 338 | return v2; |
| 339 | } |
| 340 | |
| 341 | private char composeTT(char t1, char t2) { |
| 342 | return t2; |
| 343 | } |
| 344 | |
| 345 | private boolean isLeadingC(char c) { |
| 346 | return (c >= LBase && c <= '\u1159'); |
| 347 | } |
| 348 | |
| 349 | private boolean isVowel(char c) { |
| 350 | return (c >= VBase && c <= '\u11a2'); |
| 351 | } |
| 352 | |
| 353 | private boolean isTrailingC(char c) { |
| 354 | return (c >= TBase && c <= '\u11f9'); |
| 355 | } |
| 356 | |
| 357 | /** |
| 358 | * returns the maximum number of bytes needed to convert a char |
| 359 | */ |
| 360 | public int getMaxBytesPerChar() { |
| 361 | return 2; |
| 362 | } |
| 363 | |
| 364 | |
| 365 | /** |
| 366 | * Return the character set ID |
| 367 | */ |
| 368 | public String getCharacterEncoding() { |
| 369 | return "Cp949C"; |
| 370 | } |
| 371 | |
| 372 | /** |
| 373 | * private function to add the bytes to the output buffer |
| 374 | */ |
| 375 | private void bytesToBuffer(byte[] theBytes, byte[] output, int outEnd) |
| 376 | throws ConversionBufferFullException, |
| 377 | UnknownCharacterException { |
| 378 | |
| 379 | int spaceNeeded; |
| 380 | |
| 381 | // ensure sufficient space for the bytes(s) |
| 382 | |
| 383 | if (theBytes[0] == 0x00) |
| 384 | spaceNeeded = 1; |
| 385 | else |
| 386 | spaceNeeded = 2; |
| 387 | |
| 388 | if (byteOff + spaceNeeded > outEnd) |
| 389 | throw new ConversionBufferFullException(); |
| 390 | |
| 391 | // move the data into the buffer |
| 392 | |
| 393 | if (spaceNeeded == 1) |
| 394 | output[byteOff++] = theBytes[1]; |
| 395 | else { |
| 396 | output[byteOff++] = theBytes[0]; |
| 397 | output[byteOff++] = theBytes[1]; |
| 398 | } |
| 399 | |
| 400 | } |
| 401 | |
| 402 | /** |
| 403 | * private function to add a unicode character to the output buffer |
| 404 | */ |
| 405 | private void unicodeToBuffer(char unicode, byte[] output, int outEnd) |
| 406 | throws ConversionBufferFullException, |
| 407 | UnknownCharacterException { |
| 408 | |
| 409 | int index; |
| 410 | int theBytes; |
| 411 | |
| 412 | // first we convert the unicode to its byte representation |
| 413 | |
| 414 | index = index1[((unicode & mask1) >> shift)] + (unicode & mask2); |
| 415 | if (index < 15000) |
| 416 | theBytes = (int)(index2.charAt(index)); |
| 417 | else |
| 418 | theBytes = (int)(index2a.charAt(index-15000)); |
| 419 | outputByte[0] = (byte)((theBytes & 0x0000ff00)>>8); |
| 420 | outputByte[1] = (byte)(theBytes & 0x000000ff); |
| 421 | |
| 422 | // if the unicode was not mappable - look for the substitution bytes |
| 423 | |
| 424 | if (outputByte[0] == 0x00 && outputByte[1] == 0x00 |
| 425 | && unicode != '\u0000') { |
| 426 | if (subMode) { |
| 427 | if (subBytes.length == 1) { |
| 428 | outputByte[0] = 0x00; |
| 429 | outputByte[1] = subBytes[0]; |
| 430 | } else { |
| 431 | outputByte[0] = subBytes[0]; |
| 432 | outputByte[1] = subBytes[1]; |
| 433 | } |
| 434 | } else { |
| 435 | badInputLength = 1; |
| 436 | throw new UnknownCharacterException(); |
| 437 | } |
| 438 | } |
| 439 | |
| 440 | // now put the bytes in the buffer |
| 441 | |
| 442 | bytesToBuffer(outputByte, output, outEnd); |
| 443 | |
| 444 | } |
| 445 | } |