blob: 43a6fa0aaf34769bce70a30b077a273948833688 [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Copyright 2004-2005 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
/* Miscellaneous functions for converting between Unicode (UTF-16), UTF-8 and the platform encoding. */
27
28#include <stdio.h>
29#include <stddef.h>
30#include <stdlib.h>
31#include <stdarg.h>
32#include <string.h>
33#include <ctype.h>
34
35#include "jni.h"
36
37#include "utf.h"
38
/*
 * Error handler.
 *
 * Prints a "UTF ERROR" diagnostic on stderr, tagged with the source
 * location that reported the problem, and then terminates the process
 * with abort().  This function does not return.
 *
 *   file    - name of the source file reporting the error (may be NULL)
 *   line    - line number within that file
 *   message - description of the failure (may be NULL)
 */
void
utfError(char *file, int line, char *message)
{
    /* Handing NULL to a %s conversion is undefined behavior; substitute
     * placeholders so the error reporter itself cannot fault. */
    if ( file == NULL ) {
        file = "(unknown file)";
    }
    if ( message == NULL ) {
        message = "(no message)";
    }
    (void)fprintf(stderr, "UTF ERROR [\"%s\":%d]: %s\n", file, line, message);
    /* abort() is not required to flush open streams, so do it here. */
    (void)fflush(stderr);
    abort();
}
48
49/*
50 * Convert UTF-8 to UTF-16
51 * Returns length or -1 if output overflows.
52 */
53int JNICALL
54utf8ToUtf16(struct UtfInst *ui, jbyte *utf8, int len, unsigned short *output, int outputMaxLen)
55{
56 int outputLen;
57 int i;
58
59 UTF_ASSERT(utf8);
60 UTF_ASSERT(len>=0);
61 UTF_ASSERT(output);
62 UTF_ASSERT(outputMaxLen>0);
63
64 i = 0;
65 outputLen = 0;
66 while ( i<len ) {
67 unsigned code, x, y, z;
68
69 if ( outputLen >= outputMaxLen ) {
70 return -1;
71 }
72 x = (unsigned char)utf8[i++];
73 code = x;
74 if ( (x & 0xE0)==0xE0 ) {
75 y = (unsigned char)utf8[i++];
76 z = (unsigned char)utf8[i++];
77 code = ((x & 0xF)<<12) + ((y & 0x3F)<<6) + (z & 0x3F);
78 } else if ( (x & 0xC0)==0xC0 ) {
79 y = (unsigned char)utf8[i++];
80 code = ((x & 0x1F)<<6) + (y & 0x3F);
81 }
82 output[outputLen++] = code;
83 }
84 return outputLen;
85}
86
87/*
88 * Convert UTF-16 to UTF-8 Modified
89 * Returns length or -1 if output overflows.
90 */
91int JNICALL
92utf16ToUtf8m(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen)
93{
94 int i;
95 int outputLen;
96
97 UTF_ASSERT(utf16);
98 UTF_ASSERT(len>=0);
99 UTF_ASSERT(output);
100 UTF_ASSERT(outputMaxLen>0);
101
102 outputLen = 0;
103 for (i = 0; i < len; i++) {
104 unsigned code;
105
106 code = utf16[i];
107 if ( code >= 0x0001 && code <= 0x007F ) {
108 output[outputLen++] = code;
109 } else if ( code == 0 || ( code >= 0x0080 && code <= 0x07FF ) ) {
110 output[outputLen++] = ((code>>6) & 0x1F) | 0xC0;
111 output[outputLen++] = (code & 0x3F) | 0x80;
112 } else if ( code >= 0x0800 && code <= 0xFFFF ) {
113 output[outputLen++] = ((code>>12) & 0x0F) | 0xE0;
114 output[outputLen++] = ((code>>6) & 0x3F) | 0x80;
115 output[outputLen++] = (code & 0x3F) | 0x80;
116 }
117 if ( outputLen > outputMaxLen ) {
118 return -1;
119 }
120 }
121 output[outputLen] = 0;
122 return outputLen;
123}
124
125int JNICALL
126utf16ToUtf8s(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen)
127{
128 return -1; /* FIXUP */
129}
130
131/* Determine length of this Standard UTF-8 in Modified UTF-8.
132 * Validation is done of the basic UTF encoding rules, returns
133 * length (no change) when errors are detected in the UTF encoding.
134 *
135 * Note: Accepts Modified UTF-8 also, no verification on the
136 * correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
137 */
138int JNICALL
139utf8sToUtf8mLength(struct UtfInst *ui, jbyte *string, int length)
140{
141 int newLength;
142 int i;
143
144 newLength = 0;
145 for ( i = 0 ; i < length ; i++ ) {
146 unsigned byte;
147
148 byte = (unsigned char)string[i];
149 if ( (byte & 0x80) == 0 ) { /* 1byte encoding */
150 newLength++;
151 if ( byte == 0 ) {
152 newLength++; /* We gain one byte in length on NULL bytes */
153 }
154 } else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */
155 /* Check encoding of following bytes */
156 if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) {
157 break; /* Error condition */
158 }
159 i++; /* Skip next byte */
160 newLength += 2;
161 } else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */
162 /* Check encoding of following bytes */
163 if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80
164 || (string[i+2] & 0xC0) != 0x80 ) {
165 break; /* Error condition */
166 }
167 i += 2; /* Skip next two bytes */
168 newLength += 3;
169 } else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */
170 /* Check encoding of following bytes */
171 if ( (i+3) >= length || (string[i+1] & 0xC0) != 0x80
172 || (string[i+2] & 0xC0) != 0x80
173 || (string[i+3] & 0xC0) != 0x80 ) {
174 break; /* Error condition */
175 }
176 i += 3; /* Skip next 3 bytes */
177 newLength += 6; /* 4byte encoding turns into 2 3byte ones */
178 } else {
179 break; /* Error condition */
180 }
181 }
182 if ( i != length ) {
183 /* Error in finding new length, return old length so no conversion */
184 /* FIXUP: ERROR_MESSAGE? */
185 return length;
186 }
187 return newLength;
188}
189
190/* Convert Standard UTF-8 to Modified UTF-8.
191 * Assumes the UTF-8 encoding was validated by utf8mLength() above.
192 *
193 * Note: Accepts Modified UTF-8 also, no verification on the
194 * correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
195 */
196void JNICALL
197utf8sToUtf8m(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength)
198{
199 int i;
200 int j;
201
202 j = 0;
203 for ( i = 0 ; i < length ; i++ ) {
204 unsigned byte1;
205
206 byte1 = (unsigned char)string[i];
207
208 /* NULL bytes and bytes starting with 11110xxx are special */
209 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
210 if ( byte1 == 0 ) {
211 /* Bits out: 11000000 10000000 */
212 newString[j++] = (jbyte)0xC0;
213 newString[j++] = (jbyte)0x80;
214 } else {
215 /* Single byte */
216 newString[j++] = byte1;
217 }
218 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
219 newString[j++] = byte1;
220 newString[j++] = string[++i];
221 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
222 newString[j++] = byte1;
223 newString[j++] = string[++i];
224 newString[j++] = string[++i];
225 } else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */
226 /* Beginning of 4byte encoding, turn into 2 3byte encodings */
227 unsigned byte2, byte3, byte4, u21;
228
229 /* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
230 byte2 = (unsigned char)string[++i];
231 byte3 = (unsigned char)string[++i];
232 byte4 = (unsigned char)string[++i];
233 /* Reconstruct full 21bit value */
234 u21 = (byte1 & 0x07) << 18;
235 u21 += (byte2 & 0x3F) << 12;
236 u21 += (byte3 & 0x3F) << 6;
237 u21 += (byte4 & 0x3F);
238 /* Bits out: 11101101 1010xxxx 10xxxxxx */
239 newString[j++] = (jbyte)0xED;
240 newString[j++] = (jbyte)(0xA0 + (((u21 >> 16) - 1) & 0x0F));
241 newString[j++] = (jbyte)(0x80 + ((u21 >> 10) & 0x3F));
242 /* Bits out: 11101101 1011xxxx 10xxxxxx */
243 newString[j++] = (jbyte)0xED;
244 newString[j++] = (jbyte)(0xB0 + ((u21 >> 6) & 0x0F));
245 newString[j++] = byte4;
246 }
247 }
248 UTF_ASSERT(i==length);
249 UTF_ASSERT(j==newLength);
250 newString[j] = (jbyte)0;
251}
252
253/* Given a Modified UTF-8 string, calculate the Standard UTF-8 length.
254 * Basic validation of the UTF encoding rules is done, and length is
255 * returned (no change) when errors are detected.
256 *
257 * Note: No validation is made that this is indeed Modified UTF-8 coming in.
258 *
259 */
260int JNICALL
261utf8mToUtf8sLength(struct UtfInst *ui, jbyte *string, int length)
262{
263 int newLength;
264 int i;
265
266 newLength = 0;
267 for ( i = 0 ; i < length ; i++ ) {
268 unsigned byte1, byte2, byte3, byte4, byte5, byte6;
269
270 byte1 = (unsigned char)string[i];
271 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
272 newLength++;
273 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
274 /* Check encoding of following bytes */
275 if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) {
276 break; /* Error condition */
277 }
278 byte2 = (unsigned char)string[++i];
279 if ( byte1 != 0xC0 || byte2 != 0x80 ) {
280 newLength += 2; /* Normal 2byte encoding, not 0xC080 */
281 } else {
282 newLength++; /* We will turn 0xC080 into 0 */
283 }
284 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
285 /* Check encoding of following bytes */
286 if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80
287 || (string[i+2] & 0xC0) != 0x80 ) {
288 break; /* Error condition */
289 }
290 byte2 = (unsigned char)string[++i];
291 byte3 = (unsigned char)string[++i];
292 newLength += 3;
293 /* Possible process a second 3byte encoding */
294 if ( (i+3) < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {
295 /* See if this is a pair of 3byte encodings */
296 byte4 = (unsigned char)string[i+1];
297 byte5 = (unsigned char)string[i+2];
298 byte6 = (unsigned char)string[i+3];
299 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {
300 /* Check encoding of 3rd byte */
301 if ( (byte6 & 0xC0) != 0x80 ) {
302 break; /* Error condition */
303 }
304 newLength++; /* New string will have 4byte encoding */
305 i += 3; /* Skip next 3 bytes */
306 }
307 }
308 } else {
309 break; /* Error condition */
310 }
311 }
312 if ( i != length ) {
313 /* Error in UTF encoding */
314 /* FIXUP: ERROR_MESSAGE()? */
315 return length;
316 }
317 return newLength;
318}
319
320/* Convert a Modified UTF-8 string into a Standard UTF-8 string
321 * It is assumed that this string has been validated in terms of the
322 * basic UTF encoding rules by utf8Length() above.
323 *
324 * Note: No validation is made that this is indeed Modified UTF-8 coming in.
325 *
326 */
327void JNICALL
328utf8mToUtf8s(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength)
329{
330 int i;
331 int j;
332
333 j = 0;
334 for ( i = 0 ; i < length ; i++ ) {
335 unsigned byte1, byte2, byte3, byte4, byte5, byte6;
336
337 byte1 = (unsigned char)string[i];
338 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
339 /* Single byte */
340 newString[j++] = byte1;
341 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
342 byte2 = (unsigned char)string[++i];
343 if ( byte1 != 0xC0 || byte2 != 0x80 ) {
344 newString[j++] = byte1;
345 newString[j++] = byte2;
346 } else {
347 newString[j++] = 0;
348 }
349 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
350 byte2 = (unsigned char)string[++i];
351 byte3 = (unsigned char)string[++i];
352 if ( i+3 < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {
353 /* See if this is a pair of 3byte encodings */
354 byte4 = (unsigned char)string[i+1];
355 byte5 = (unsigned char)string[i+2];
356 byte6 = (unsigned char)string[i+3];
357 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {
358 unsigned u21;
359
360 /* Bits in: 11101101 1010xxxx 10xxxxxx */
361 /* Bits in: 11101101 1011xxxx 10xxxxxx */
362 i += 3;
363
364 /* Reconstruct 21 bit code */
365 u21 = ((byte2 & 0x0F) + 1) << 16;
366 u21 += (byte3 & 0x3F) << 10;
367 u21 += (byte5 & 0x0F) << 6;
368 u21 += (byte6 & 0x3F);
369
370 /* Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
371
372 /* Convert to 4byte encoding */
373 newString[j++] = 0xF0 + ((u21 >> 18) & 0x07);
374 newString[j++] = 0x80 + ((u21 >> 12) & 0x3F);
375 newString[j++] = 0x80 + ((u21 >> 6) & 0x3F);
376 newString[j++] = 0x80 + (u21 & 0x3F);
377 continue;
378 }
379 }
380 /* Normal 3byte encoding */
381 newString[j++] = byte1;
382 newString[j++] = byte2;
383 newString[j++] = byte3;
384 }
385 }
386 UTF_ASSERT(i==length);
387 UTF_ASSERT(j==newLength);
388 newString[j] = 0;
389}
390
391/* ================================================================= */
392
393#if 1 /* Test program */
394
/*
 * Convert any byte array into a printable, 0 terminated string.
 *
 *   ui           - not used by this function
 *   bytes        - input bytes
 *   len          - number of input bytes
 *   output       - receives the printable text
 *   outputMaxLen - total capacity of output in bytes, INCLUDING the
 *                  terminating 0
 *
 * Printable 7-bit characters are copied as is; every other byte is written
 * as the four characters \xHH (two lowercase hex digits).
 *
 * Returns the length of the text (terminator excluded), or -1 if the text
 * plus its terminator does not fit in outputMaxLen bytes.  On -1 the
 * contents of output are unspecified.
 */
static int
bytesToPrintable(struct UtfInst *ui, char *bytes, int len, char *output, int outputMaxLen)
{
    int outputLen;
    int i;

    UTF_ASSERT(bytes);
    UTF_ASSERT(len>=0);
    UTF_ASSERT(output);
    UTF_ASSERT(outputMaxLen>=0);

    outputLen = 0;
    for ( i=0; i<len ; i++ ) {
        unsigned byte;
        int      need;

        /* Go through unsigned char: where plain char is signed, a byte
         * >= 0x80 would otherwise sign-extend and be printed as eight hex
         * digits while only four are accounted for. */
        byte = (unsigned char)bytes[i];
        need = ( byte <= 0x7f && isprint((int)byte) && !iscntrl((int)byte) ) ? 1 : 4;

        /* Check BEFORE writing, and keep one byte back for the terminator */
        if ( need >= outputMaxLen - outputLen ) {
            return -1;
        }
        if ( need == 1 ) {
            output[outputLen++] = (char)byte;
        } else {
            /* byte <= 0xff, so this is exactly 4 characters plus the 0,
             * all of which were reserved by the check above. */
            (void)sprintf(output+outputLen,"\\x%02x",byte);
            outputLen += 4;
        }
    }
    if ( outputLen >= outputMaxLen ) {
        return -1;      /* Only reachable with len==0 and outputMaxLen==0 */
    }
    output[outputLen] = 0;
    return outputLen;
}
428
429static void
430test(void)
431{
432 static char *strings[] = {
433 "characters",
434 "abcdefghijklmnopqrstuvwxyz",
435 "0123456789",
436 "!@#$%^&*()_+=-{}[]:;",
437 NULL };
438 int i;
439 struct UtfInst *ui;
440
441 ui = utfInitialize(NULL);
442
443 i = 0;
444 while ( strings[i] != NULL ) {
445 char *str;
446 #define MAX 1024
447 char buf0[MAX];
448 char buf1[MAX];
449 char buf2[MAX];
450 unsigned short buf3[MAX];
451 int len1;
452 int len2;
453 int len3;
454
455 str = strings[i];
456
457 (void)bytesToPrintable(ui, str, (int)strlen(str), buf0, 1024);
458
459 len1 = utf8FromPlatform(ui, str, (int)strlen(str), (jbyte*)buf1, 1024);
460
461 UTF_ASSERT(len1==(int)strlen(str));
462
463 len3 = utf8ToUtf16(ui, (jbyte*)buf1, len1, (jchar*)buf3, 1024);
464
465 UTF_ASSERT(len3==len1);
466
467 len1 = utf16ToUtf8m(ui, (jchar*)buf3, len3, (jbyte*)buf1, 1024);
468
469 UTF_ASSERT(len1==len3);
470 UTF_ASSERT(strcmp(str, buf1) == 0);
471
472 len2 = utf8ToPlatform(ui, (jbyte*)buf1, len1, buf2, 1024);
473
474 UTF_ASSERT(len2==len1);
475 UTF_ASSERT(strcmp(str, buf2) == 0);
476
477 i++;
478 }
479
480 utfTerminate(ui, NULL);
481
482}
483
/*
 * Entry point of the test program: runs the self test and exits with
 * status 0.  Command line arguments are ignored.
 */
int
main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    test();
    return 0;
}
490
491#endif