blob: e72577ed6d26a31f15051444dfa47ecd2ea48631 [file] [log] [blame]
Daniel Veillard891e4041998-10-19 00:43:02 +00001/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * [ISO-10646] UTF-8 and UTF-16 in Annexes
7 * [ISO-8859-1] ISO Latin-1 characters codes.
8 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9 * Worldwide Character Encoding -- Version 1.0", Addison-
10 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11 * described in Unicode Technical Report #4.
12 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13 * Information Interchange, ANSI X3.4-1986.
14 *
Daniel Veillard14fff061999-06-22 21:49:07 +000015 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Daniel Veillard891e4041998-10-19 00:43:02 +000016 *
17 * See Copyright for the status of this software.
18 *
Daniel Veillard891e4041998-10-19 00:43:02 +000019 * Daniel.Veillard@w3.org
20 */
21
Daniel Veillardb96e6431999-08-29 21:02:19 +000022#include "config.h"
Daniel Veillard27d88741999-05-29 11:51:49 +000023#include <ctype.h>
Daniel Veillard14fff061999-06-22 21:49:07 +000024#include <string.h>
25#include <stdio.h>
Daniel Veillard891e4041998-10-19 00:43:02 +000026#include "encoding.h"
Daniel Veillardb05deb71999-08-10 19:04:08 +000027#ifdef HAVE_UNICODE_H
28#include <unicode.h>
29#endif
Daniel Veillard6454aec1999-09-02 22:04:43 +000030#include "xmlmemory.h"
Daniel Veillard891e4041998-10-19 00:43:02 +000031
Daniel Veillardb05deb71999-08-10 19:04:08 +000032#ifdef HAVE_UNICODE_H
33
34#else /* ! HAVE_UNICODE_H */
Daniel Veillard0ba4d531998-11-01 19:34:31 +000035/*
36 * From rfc2044: encoding of the Unicode values on UTF-8:
37 *
38 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
39 * 0000 0000-0000 007F 0xxxxxxx
40 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
41 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
42 *
43 * I hope we won't use values > 0xFFFF anytime soon !
44 */
45
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Values below 0x80 are copied unchanged, every other
 * value is expanded to a two bytes sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* dst = out;
    unsigned char* dstlimit = out + outlen;
    unsigned char* srclimit = in + inlen;

    for ( ; in < srclimit; in++) {
        unsigned char ch = *in;

        if (dst >= dstlimit)
            return -1;
        if (ch >= 0x80) {
            /* lead byte: 110000xx, carrying the two high bits of the value */
            *dst++ = 0xC0 | (ch >> 6);
            if (dst >= dstlimit)
                return -1;
            /* what is left to emit is the continuation byte: 10xxxxxx */
            ch = 0x80 | (0x3F & ch);
        }
        *dst++ = ch;
    }
    return dst - out;
}
80
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only ASCII and the two bytes sequences starting with
 * 0xC2 or 0xC3 (i.e. values 0x80 to 0xFF) can be represented.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed (value outside of ISO Latin 1, truncated
 *     sequence or invalid continuation byte).
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c, trail;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend)  return -1;
            *out++= c;
        }
        else if (((c & 0xFE) == 0xC2) && in<inend) {
            if (out >= outend)  return -1;
            trail= *in++;
            /* the second byte must be a continuation byte: 10xxxxxx */
            if ((trail & 0xC0) != 0x80)  return -2;
            *out++= ((c & 0x03) << 6) | (trail & 0x3F);
        }
        else  return -2;
    }
    return out-outstart;
}
117
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in (number of shorts)
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. Surrogate pairs are combined into a single value
 * which is then written as a four bytes sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space or if a
 *     surrogate is not part of a well formed high/low pair.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned short* inend= in+inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c= *in++;
        if ((c & 0xFC00) == 0xD800) {    /* high surrogate, needs its pair */
            if ((in<inend) && (((d=*in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else  return -1;
        }
        else if ((c & 0xFC00) == 0xDC00)  return -1;  /* stray low surrogate */

        /* assertion: c is a single UCS-4 value */

        /*
         * Write the lead byte; bits is the shift needed to reach the
         * payload of the first continuation byte (negative when the
         * character needs none).
         */
        if (out >= outend)  return -1;
        if      (c <    0x80) {  *out++=  c;                bits= -6; }
        else if (c <   0x800) {  *out++= (c >>  6) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= (c >> 12) | 0xE0;  bits=  6; }
        else                  {  *out++= (c >> 18) | 0xF0;  bits= 12; }

        /* continuation bytes: 10xxxxxx, most significant group first */
        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend)  return -1;
            *out++= ((c >> bits) & 0x3F) | 0x80;
        }
    }
    return out-outstart;
}
165
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Values above 0xFFFF are written as a surrogate pair.
 * TODO: UTF8ToUTF16 need a fallback mechanism ...
 *
 * Returns the number of shorts written, or -1 by lack of space, or -2
 *     if the transcoding failed (invalid lead byte, invalid or missing
 *     continuation byte, value not representable in UTF-16).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart= out;
    unsigned short* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* malformed input, this has nothing to do with output space */
            if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend)  return -1;
            *out++ = c;
        }
        else if (c < 0x110000) {
            /* two slots are needed for the surrogate pair */
            if (out+1 >= outend)  return -1;
            c -= 0x10000;
            *out++ = 0xD800 | (c >> 10);
            *out++ = 0xDC00 | (c & 0x03FF);
        }
        else  return -2;    /* beyond the range reachable by UTF-16 */
    }
    return out-outstart;
}
218
Daniel Veillardb05deb71999-08-10 19:04:08 +0000219#endif /* ! HAVE_UNICODE_H */
Daniel Veillard97b58771998-10-20 06:14:16 +0000220
Daniel Veillard27d88741999-05-29 11:51:49 +0000221/**
222 * xmlDetectCharEncoding:
223 * @in: a pointer to the first bytes of the XML entity, must be at least
224 * 4 bytes long.
225 *
226 * Guess the encoding of the entity using the first bytes of the entity content
227 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
228 *
229 * Returns one of the XML_CHAR_ENCODING_... values.
230 */
231xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000232xmlDetectCharEncoding(const unsigned char* in)
Daniel Veillard27d88741999-05-29 11:51:49 +0000233{
234 if ((in[0] == 0x00) && (in[1] == 0x00) &&
235 (in[2] == 0x00) && (in[3] == 0x3C))
236 return(XML_CHAR_ENCODING_UCS4BE);
237 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
238 (in[2] == 0x00) && (in[3] == 0x00))
239 return(XML_CHAR_ENCODING_UCS4LE);
240 if ((in[0] == 0x00) && (in[1] == 0x00) &&
241 (in[2] == 0x3C) && (in[3] == 0x00))
242 return(XML_CHAR_ENCODING_UCS4_2143);
243 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
244 (in[2] == 0x00) && (in[3] == 0x00))
245 return(XML_CHAR_ENCODING_UCS4_3412);
246 if ((in[0] == 0xFE) && (in[1] == 0xFF))
247 return(XML_CHAR_ENCODING_UTF16BE);
248 if ((in[0] == 0xFF) && (in[1] == 0xFE))
249 return(XML_CHAR_ENCODING_UTF16LE);
250 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
251 (in[2] == 0xA7) && (in[3] == 0x94))
252 return(XML_CHAR_ENCODING_EBCDIC);
253 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
254 (in[2] == 0x78) && (in[3] == 0x6D))
255 return(XML_CHAR_ENCODING_UTF8);
256 return(XML_CHAR_ENCODING_NONE);
257}
258
259/**
260 * xmlParseCharEncoding:
261 * @name: the encoding name as parsed, in UTF-8 format (ASCCI actually)
262 *
263 * Conpare the string to the known encoding schemes already known. Note
264 * that the comparison is case insensitive accordingly to the section
265 * [XML] 4.3.3 Character Encoding in Entities.
266 *
267 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
268 * if not recognized.
269 */
270xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000271xmlParseCharEncoding(const char* name)
Daniel Veillard27d88741999-05-29 11:51:49 +0000272{
273 char upper[500];
274 int i;
275
276 for (i = 0;i < 499;i++) {
277 upper[i] = toupper(name[i]);
278 if (upper[i] == 0) break;
279 }
280 upper[i] = 0;
281
282 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
283 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
284 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
285
286 /*
287 * NOTE: if we were able to parse this, the endianness of UTF16 is
288 * already found and in use
289 */
290 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
291 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
292
293 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
294 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
295 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
296
297 /*
298 * NOTE: if we were able to parse this, the endianness of UCS4 is
299 * already found and in use
300 */
301 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
302 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
303 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
304
305
306 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
307 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
308 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
309
310 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
311 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
312 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
313
314 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
315 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
316 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
317 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
318 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
319 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
320 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
321
322 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
323 if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
324 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
325 return(XML_CHAR_ENCODING_ERROR);
326}
Daniel Veillard14fff061999-06-22 21:49:07 +0000327
328/****************************************************************
329 * *
330 * Char encoding handlers *
331 * *
332 ****************************************************************/
333
/*
 * Table of the registered char encoding handlers.
 * MAX_ENCODING_HANDLERS is its fixed capacity: the size should be
 * growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
/* the table itself, allocated by xmlInitCharEncodingHandlers() */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of entries of the table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
345
346/**
347 * xmlNewCharEncodingHandler:
348 * @name: the encoding name, in UTF-8 format (ASCCI actually)
349 * @input: the xmlCharEncodingInputFunc to read that encoding
350 * @output: the xmlCharEncodingOutputFunc to write that encoding
351 *
352 * Create and registers an xmlCharEncodingHandler.
353 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
354 */
355xmlCharEncodingHandlerPtr
356xmlNewCharEncodingHandler(const char *name, xmlCharEncodingInputFunc input,
357 xmlCharEncodingOutputFunc output) {
358 xmlCharEncodingHandlerPtr handler;
359 char upper[500];
360 int i;
361 char *up = 0;
362
363 /*
364 * Keep only the uppercase version of the encoding.
365 */
366 if (name == NULL) {
367 fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
368 return(NULL);
369 }
370 for (i = 0;i < 499;i++) {
371 upper[i] = toupper(name[i]);
372 if (upper[i] == 0) break;
373 }
374 upper[i] = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +0000375 up = xmlMemStrdup(upper);
Daniel Veillard14fff061999-06-22 21:49:07 +0000376 if (up == NULL) {
377 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
378 return(NULL);
379 }
380
381 /*
382 * allocate and fill-up an handler block.
383 */
384 handler = (xmlCharEncodingHandlerPtr)
Daniel Veillard6454aec1999-09-02 22:04:43 +0000385 xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard14fff061999-06-22 21:49:07 +0000386 if (handler == NULL) {
387 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
388 return(NULL);
389 }
390 handler->input = input;
391 handler->output = output;
392 handler->name = up;
393
394 /*
395 * registers and returns the handler.
396 */
397 xmlRegisterCharEncodingHandler(handler);
398 return(handler);
399}
400
401/**
402 * xmlInitCharEncodingHandlers:
403 *
404 * Initialize the char encoding support, it registers the default
405 * encoding supported.
406 * NOTE: while public theis function usually don't need to be called
407 * in normal processing.
408 */
409void
410xmlInitCharEncodingHandlers(void) {
411 if (handlers != NULL) return;
412
413 handlers = (xmlCharEncodingHandlerPtr *)
Daniel Veillard6454aec1999-09-02 22:04:43 +0000414 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
Daniel Veillard14fff061999-06-22 21:49:07 +0000415
416 if (handlers == NULL) {
417 fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
418 return;
419 }
420 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
Daniel Veillardb05deb71999-08-10 19:04:08 +0000421#ifdef HAVE_UNICODE_H
422#else
Daniel Veillardb96e6431999-08-29 21:02:19 +0000423 /* xmlNewCharEncodingHandler("UTF-16", UTF16ToUTF8, UTF8ToUTF16); */
Daniel Veillard14fff061999-06-22 21:49:07 +0000424 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
Daniel Veillardb05deb71999-08-10 19:04:08 +0000425#endif
Daniel Veillard14fff061999-06-22 21:49:07 +0000426}
427
428/**
429 * xmlRegisterCharEncodingHandler:
430 * @handler: the xmlCharEncodingHandlerPtr handler block
431 *
432 * Register the char encoding handler, surprizing, isn't it ?
433 */
434void
435xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
436 if (handlers == NULL) xmlInitCharEncodingHandlers();
437 if (handler == NULL) {
438 fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
439 return;
440 }
441
442 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
443 fprintf(stderr,
444 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
445 fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
446 return;
447 }
448 handlers[nbCharEncodingHandler++] = handler;
449}
450
451/**
452 * xmlGetCharEncodingHandler:
453 * @enc: an xmlCharEncoding value.
454 *
455 * Search in the registrered set the handler able to read/write that encoding.
456 *
457 * Returns the handler or NULL if not found
458 */
459xmlCharEncodingHandlerPtr
460xmlGetCharEncodingHandler(xmlCharEncoding enc) {
461 if (handlers == NULL) xmlInitCharEncodingHandlers();
Daniel Veillardb96e6431999-08-29 21:02:19 +0000462 /* TODO xmlGetCharEncodingHandler !!!!!!! */
Daniel Veillard14fff061999-06-22 21:49:07 +0000463 return(NULL);
464}
465
466/**
467 * xmlGetCharEncodingHandler:
468 * @enc: a string describing the char encoding.
469 *
470 * Search in the registrered set the handler able to read/write that encoding.
471 *
472 * Returns the handler or NULL if not found
473 */
474xmlCharEncodingHandlerPtr
475xmlFindCharEncodingHandler(const char *name) {
476 char upper[500];
477 int i;
478
479 if (handlers == NULL) xmlInitCharEncodingHandlers();
480 if (name == NULL) return(xmlDefaultCharEncodingHandler);
481 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
482
483 for (i = 0;i < 499;i++) {
484 upper[i] = toupper(name[i]);
485 if (upper[i] == 0) break;
486 }
487 upper[i] = 0;
488
489 for (i = 0;i < nbCharEncodingHandler; i++)
490 if (!strcmp(name, handlers[i]->name))
491 return(handlers[i]);
492
493 return(NULL);
494}
495