blob: 5169cdea493a13b037e0538fe7b2484f13505bdd [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 character codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
21
Daniel Veillard7f7d1111999-09-22 09:46:25 +000022#ifndef WIN32
Daniel Veillardb96e6431999-08-29 21:02:19 +000023#include "config.h"
Daniel Veillard7f7d1111999-09-22 09:46:25 +000024#endif
25
Daniel Veillard14fff061999-06-22 21:49:07 +000026#include <stdio.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000027#include <string.h>
28
29#ifdef HAVE_CTYPE_H
30#include <ctype.h>
31#endif
Daniel Veillard891e4041998-10-19 00:43:02 +000032#include "encoding.h"
Daniel Veillardb05deb71999-08-10 19:04:08 +000033#ifdef HAVE_UNICODE_H
34#include <unicode.h>
35#endif
Daniel Veillard6454aec1999-09-02 22:04:43 +000036#include "xmlmemory.h"
Daniel Veillard891e4041998-10-19 00:43:02 +000037
Daniel Veillardb05deb71999-08-10 19:04:08 +000038#ifdef HAVE_UNICODE_H
39
40#else /* ! HAVE_UNICODE_H */
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F          0xxxxxxx
 * 0000 0080-0000 07FF          110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF          1110xxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 */
51
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged, every other
 * byte is expanded to a two-byte UTF-8 sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space. When -1 is
 * returned @out only holds complete UTF-8 sequences (no dangling lead byte).
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else {
            /*
             * Make sure both bytes fit before emitting the first one, so
             * a full buffer never ends with half a character.
             */
            if (outend - out < 2) return -1;
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (0x3F & c));
        }
    }
    return (int) (out-outstart);
}
86
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only ASCII bytes and two-byte sequences introduced
 * by 0xC2 or 0xC3 (i.e. U+0080..U+00FF) can be represented.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed (character outside ISO Latin 1, truncated
 *     sequence or invalid continuation byte).
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++= c;
        }
        else if (((c & 0xFE) == 0xC2) && (in < inend) &&
                 ((*in & 0xC0) == 0x80)) {
            /* lead byte 0xC2/0xC3 followed by a real 10xxxxxx byte */
            if (out >= outend) return -1;
            *out++= (unsigned char) (((c & 0x03) << 6) | (*in++ & 0x3F));
        }
        else return -2;
    }
    return (int) (out-outstart);
}
123
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in, in number of shorts
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. Surrogate pairs are combined into a single
 * four-byte UTF-8 sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space. -1 is also
 * returned when a high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned short* inend= in+inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c= *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in<inend) && (((d=*in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Emit the lead byte; bits is the shift of the first trailing
         * byte, or negative when there is no trailing byte at all.
         */
        if (out >= outend) return -1;
        if (c < 0x80) {
            *out++= (unsigned char) c;
            bits= -6;
        }
        else if (c < 0x800) {
            *out++= (unsigned char) ((c >> 6) | 0xC0);
            bits= 0;
        }
        else if (c < 0x10000) {
            *out++= (unsigned char) ((c >> 12) | 0xE0);
            bits= 6;
        }
        else {
            *out++= (unsigned char) ((c >> 18) | 0xF0);
            bits= 12;
        }

        /* trailing bytes: 10xxxxxx, six payload bits each */
        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend) return -1;
            *out++= (unsigned char) (((c >> bits) & 0x3F) | 0x80);
        }
    }
    return (int) (out-outstart);
}
171
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Values above 0xFFFF are written as a surrogate pair.
 * TODO: UTF8ToUTF16 need a fallback mechanism ...
 *
 * Returns the number of shorts written, or -1 by lack of space, or -2
 *     if the transcoding failed (invalid lead byte, truncated sequence,
 *     invalid continuation byte or value not representable in UTF-16).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart= out;
    unsigned short* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d= *in++;
        if (d < 0x80) { c= d; trailing= 0; }
        else if (d < 0xC0) return -2;    /* trailing byte in leading position */
        else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* malformed input is a transcoding failure, not lack of space */
            if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = (unsigned short) c;
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs two free slots */
            if (out+1 >= outend) return -1;
            c -= 0x10000;
            *out++ = (unsigned short) (0xD800 | (c >> 10));
            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
        }
        else return -2;    /* beyond the range UTF-16 can encode */
    }
    return (int) (out-outstart);
}
224
Daniel Veillardb05deb71999-08-10 19:04:08 +0000225#endif /* ! HAVE_UNICODE_H */
Daniel Veillard97b58771998-10-20 06:14:16 +0000226
Daniel Veillard27d88741999-05-29 11:51:49 +0000227/**
228 * xmlDetectCharEncoding:
229 * @in: a pointer to the first bytes of the XML entity, must be at least
230 * 4 bytes long.
231 *
232 * Guess the encoding of the entity using the first bytes of the entity content
233 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
234 *
235 * Returns one of the XML_CHAR_ENCODING_... values.
236 */
237xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000238xmlDetectCharEncoding(const unsigned char* in)
Daniel Veillard27d88741999-05-29 11:51:49 +0000239{
240 if ((in[0] == 0x00) && (in[1] == 0x00) &&
241 (in[2] == 0x00) && (in[3] == 0x3C))
242 return(XML_CHAR_ENCODING_UCS4BE);
243 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
244 (in[2] == 0x00) && (in[3] == 0x00))
245 return(XML_CHAR_ENCODING_UCS4LE);
246 if ((in[0] == 0x00) && (in[1] == 0x00) &&
247 (in[2] == 0x3C) && (in[3] == 0x00))
248 return(XML_CHAR_ENCODING_UCS4_2143);
249 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
250 (in[2] == 0x00) && (in[3] == 0x00))
251 return(XML_CHAR_ENCODING_UCS4_3412);
252 if ((in[0] == 0xFE) && (in[1] == 0xFF))
253 return(XML_CHAR_ENCODING_UTF16BE);
254 if ((in[0] == 0xFF) && (in[1] == 0xFE))
255 return(XML_CHAR_ENCODING_UTF16LE);
256 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
257 (in[2] == 0xA7) && (in[3] == 0x94))
258 return(XML_CHAR_ENCODING_EBCDIC);
259 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
260 (in[2] == 0x78) && (in[3] == 0x6D))
261 return(XML_CHAR_ENCODING_UTF8);
262 return(XML_CHAR_ENCODING_NONE);
263}
264
265/**
266 * xmlParseCharEncoding:
267 * @name: the encoding name as parsed, in UTF-8 format (ASCCI actually)
268 *
269 * Conpare the string to the known encoding schemes already known. Note
270 * that the comparison is case insensitive accordingly to the section
271 * [XML] 4.3.3 Character Encoding in Entities.
272 *
273 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
274 * if not recognized.
275 */
276xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000277xmlParseCharEncoding(const char* name)
Daniel Veillard27d88741999-05-29 11:51:49 +0000278{
279 char upper[500];
280 int i;
281
282 for (i = 0;i < 499;i++) {
283 upper[i] = toupper(name[i]);
284 if (upper[i] == 0) break;
285 }
286 upper[i] = 0;
287
288 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
289 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
290 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
291
292 /*
293 * NOTE: if we were able to parse this, the endianness of UTF16 is
294 * already found and in use
295 */
296 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
297 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
298
299 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
300 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
301 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
302
303 /*
304 * NOTE: if we were able to parse this, the endianness of UCS4 is
305 * already found and in use
306 */
307 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
308 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
309 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
310
311
312 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
313 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
314 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
315
316 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
317 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
318 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
319
320 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
321 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
322 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
323 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
324 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
325 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
326 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
327
328 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
329 if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
330 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
331 return(XML_CHAR_ENCODING_ERROR);
332}
Daniel Veillard14fff061999-06-22 21:49:07 +0000333
334/****************************************************************
335 * *
336 * Char encoding handlers *
337 * *
338 ****************************************************************/
339
/* the size should be growable, but it's not a big deal ... */
#define MAX_ENCODING_HANDLERS 50
/*
 * Table of registered handlers; allocated with MAX_ENCODING_HANDLERS slots
 * by xmlInitCharEncodingHandlers(), NULL until then.
 */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of slots of the table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
351
352/**
353 * xmlNewCharEncodingHandler:
354 * @name: the encoding name, in UTF-8 format (ASCCI actually)
355 * @input: the xmlCharEncodingInputFunc to read that encoding
356 * @output: the xmlCharEncodingOutputFunc to write that encoding
357 *
358 * Create and registers an xmlCharEncodingHandler.
359 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
360 */
361xmlCharEncodingHandlerPtr
362xmlNewCharEncodingHandler(const char *name, xmlCharEncodingInputFunc input,
363 xmlCharEncodingOutputFunc output) {
364 xmlCharEncodingHandlerPtr handler;
365 char upper[500];
366 int i;
367 char *up = 0;
368
369 /*
370 * Keep only the uppercase version of the encoding.
371 */
372 if (name == NULL) {
373 fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
374 return(NULL);
375 }
376 for (i = 0;i < 499;i++) {
377 upper[i] = toupper(name[i]);
378 if (upper[i] == 0) break;
379 }
380 upper[i] = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +0000381 up = xmlMemStrdup(upper);
Daniel Veillard14fff061999-06-22 21:49:07 +0000382 if (up == NULL) {
383 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
384 return(NULL);
385 }
386
387 /*
388 * allocate and fill-up an handler block.
389 */
390 handler = (xmlCharEncodingHandlerPtr)
Daniel Veillard6454aec1999-09-02 22:04:43 +0000391 xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard14fff061999-06-22 21:49:07 +0000392 if (handler == NULL) {
393 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
394 return(NULL);
395 }
396 handler->input = input;
397 handler->output = output;
398 handler->name = up;
399
400 /*
401 * registers and returns the handler.
402 */
403 xmlRegisterCharEncodingHandler(handler);
404 return(handler);
405}
406
407/**
408 * xmlInitCharEncodingHandlers:
409 *
410 * Initialize the char encoding support, it registers the default
411 * encoding supported.
412 * NOTE: while public theis function usually don't need to be called
413 * in normal processing.
414 */
415void
416xmlInitCharEncodingHandlers(void) {
417 if (handlers != NULL) return;
418
419 handlers = (xmlCharEncodingHandlerPtr *)
Daniel Veillard6454aec1999-09-02 22:04:43 +0000420 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
Daniel Veillard14fff061999-06-22 21:49:07 +0000421
422 if (handlers == NULL) {
423 fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
424 return;
425 }
426 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
Daniel Veillardb05deb71999-08-10 19:04:08 +0000427#ifdef HAVE_UNICODE_H
428#else
Daniel Veillardb96e6431999-08-29 21:02:19 +0000429 /* xmlNewCharEncodingHandler("UTF-16", UTF16ToUTF8, UTF8ToUTF16); */
Daniel Veillard14fff061999-06-22 21:49:07 +0000430 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
Daniel Veillardb05deb71999-08-10 19:04:08 +0000431#endif
Daniel Veillard14fff061999-06-22 21:49:07 +0000432}
433
434/**
435 * xmlRegisterCharEncodingHandler:
436 * @handler: the xmlCharEncodingHandlerPtr handler block
437 *
438 * Register the char encoding handler, surprizing, isn't it ?
439 */
440void
441xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
442 if (handlers == NULL) xmlInitCharEncodingHandlers();
443 if (handler == NULL) {
444 fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
445 return;
446 }
447
448 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
449 fprintf(stderr,
450 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
451 fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
452 return;
453 }
454 handlers[nbCharEncodingHandler++] = handler;
455}
456
457/**
458 * xmlGetCharEncodingHandler:
459 * @enc: an xmlCharEncoding value.
460 *
461 * Search in the registrered set the handler able to read/write that encoding.
462 *
463 * Returns the handler or NULL if not found
464 */
465xmlCharEncodingHandlerPtr
466xmlGetCharEncodingHandler(xmlCharEncoding enc) {
467 if (handlers == NULL) xmlInitCharEncodingHandlers();
Daniel Veillardb96e6431999-08-29 21:02:19 +0000468 /* TODO xmlGetCharEncodingHandler !!!!!!! */
Daniel Veillard14fff061999-06-22 21:49:07 +0000469 return(NULL);
470}
471
472/**
473 * xmlGetCharEncodingHandler:
474 * @enc: a string describing the char encoding.
475 *
476 * Search in the registrered set the handler able to read/write that encoding.
477 *
478 * Returns the handler or NULL if not found
479 */
480xmlCharEncodingHandlerPtr
481xmlFindCharEncodingHandler(const char *name) {
482 char upper[500];
483 int i;
484
485 if (handlers == NULL) xmlInitCharEncodingHandlers();
486 if (name == NULL) return(xmlDefaultCharEncodingHandler);
487 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
488
489 for (i = 0;i < 499;i++) {
490 upper[i] = toupper(name[i]);
491 if (upper[i] == 0) break;
492 }
493 upper[i] = 0;
494
495 for (i = 0;i < nbCharEncodingHandler; i++)
496 if (!strcmp(name, handlers[i]->name))
497 return(handlers[i]);
498
499 return(NULL);
500}
501