blob: 3d65469edd6ad269f58901b5938f0ae065660980 [file] [log] [blame]
Daniel Veillard891e4041998-10-19 00:43:02 +00001/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * [ISO-10646] UTF-8 and UTF-16 in Annexes
7 * [ISO-8859-1] ISO Latin-1 characters codes.
8 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9 * Worldwide Character Encoding -- Version 1.0", Addison-
10 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11 * described in Unicode Technical Report #4.
12 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13 * Information Interchange, ANSI X3.4-1986.
14 *
Daniel Veillard14fff061999-06-22 21:49:07 +000015 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Daniel Veillard891e4041998-10-19 00:43:02 +000016 *
17 * See Copyright for the status of this software.
18 *
Daniel Veillard891e4041998-10-19 00:43:02 +000019 * Daniel.Veillard@w3.org
20 */
21
#include "config.h"
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include "encoding.h"
#ifdef HAVE_UNICODE_H
#include <unicode.h>
#endif
Daniel Veillard891e4041998-10-19 00:43:02 +000033
Daniel Veillardb05deb71999-08-10 19:04:08 +000034#ifdef HAVE_UNICODE_H
35
36#else /* ! HAVE_UNICODE_H */
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Values > 0xFFFF only occur in the UTF-16 <-> UTF-8 conversions, where
 * they are carried by surrogate pairs on the UTF-16 side.
 */
47
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Convert a block of ISO Latin 1 chars to UTF-8. Bytes below 0x80 are
 * copied as is, every other byte becomes a two-byte UTF-8 sequence.
 * Output produced before running out of room is left in @out.
 *
 * Returns the number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    int written = 0;
    int consumed;

    for (consumed = 0; consumed < inlen; consumed++) {
        unsigned char ch = in[consumed];

        if (ch >= 0x80) {
            /* emit the lead byte now, the continuation byte just below */
            if (written >= outlen)
                return -1;
            out[written++] = 0xC0 | (ch >> 6);
            ch = 0x80 | (ch & 0x3F);
        }
        if (written >= outlen)
            return -1;
        out[written++] = ch;
    }
    return written;
}
82
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only ASCII and the two-byte sequences led by 0xC2
 * or 0xC3 (i.e. U+0080..U+00FF) can be represented; anything else, as
 * well as a truncated or malformed two-byte sequence, is an error.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed.
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c, d;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++= c;
        }
        else if ((c & 0xFE) == 0xC2) {
            /* lead byte 0xC2/0xC3: exactly one continuation byte must follow */
            if (in >= inend) return -2;
            d= *in;
            /* a continuation byte always looks like 10xxxxxx */
            if ((d & 0xC0) != 0x80) return -2;
            if (out >= outend) return -1;
            in++;
            *out++= (unsigned char) (((c & 0x03) << 6) | (d & 0x3F));
        }
        else return -2;
    }
    return (int) (out-outstart);
}
119
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in (number of shorts)
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. A high surrogate must be immediately followed by a
 * low surrogate; the pair is combined into one value above 0xFFFF and
 * written as a four-byte sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space or when a
 *     high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned short* inend= in+inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c= *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in<inend) && (((d=*in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Write the lead byte; bits is the shift of the first continuation
         * byte still to be written (-6 means there is none).
         */
        if (out >= outend) return -1;
        if      (c <    0x80) { *out++= (unsigned char) c;                  bits= -6; }
        else if (c <   0x800) { *out++= (unsigned char) ((c >>  6) | 0xC0); bits=  0; }
        else if (c < 0x10000) { *out++= (unsigned char) ((c >> 12) | 0xE0); bits=  6; }
        else                  { *out++= (unsigned char) ((c >> 18) | 0xF0); bits= 12; }

        /* continuation bytes: 10xxxxxx, six payload bits each */
        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend) return -1;
            *out++= (unsigned char) (((c >> bits) & 0x3F) | 0x80);
        }
    }
    return (int) (out-outstart);
}
167
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Values above 0xFFFF are written as a surrogate
 * pair (two shorts).
 * TODO: UTF8ToUTF16 need a fallback mechanism ...
 *
 * Returns the number of shorts written, or -1 by lack of space, or -2
 *     if the transcoding failed (bad lead byte, missing or malformed
 *     continuation byte, or value above 0x10FFFF).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart= out;
    unsigned short* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /*
             * A truncated or malformed sequence is an input error, not an
             * output-space problem, so it must not be reported as -1.
             */
            if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = (unsigned short) c;
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs two free slots */
            if (outend - out < 2) return -1;
            c -= 0x10000;
            *out++ = (unsigned short) (0xD800 | (c >> 10));
            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
        }
        else return -2;    /* beyond the range UTF-16 can encode */
    }
    return (int) (out-outstart);
}
220
Daniel Veillardb05deb71999-08-10 19:04:08 +0000221#endif /* ! HAVE_UNICODE_H */
Daniel Veillard97b58771998-10-20 06:14:16 +0000222
Daniel Veillard27d88741999-05-29 11:51:49 +0000223/**
224 * xmlDetectCharEncoding:
225 * @in: a pointer to the first bytes of the XML entity, must be at least
226 * 4 bytes long.
227 *
228 * Guess the encoding of the entity using the first bytes of the entity content
229 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
230 *
231 * Returns one of the XML_CHAR_ENCODING_... values.
232 */
233xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000234xmlDetectCharEncoding(const unsigned char* in)
Daniel Veillard27d88741999-05-29 11:51:49 +0000235{
236 if ((in[0] == 0x00) && (in[1] == 0x00) &&
237 (in[2] == 0x00) && (in[3] == 0x3C))
238 return(XML_CHAR_ENCODING_UCS4BE);
239 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
240 (in[2] == 0x00) && (in[3] == 0x00))
241 return(XML_CHAR_ENCODING_UCS4LE);
242 if ((in[0] == 0x00) && (in[1] == 0x00) &&
243 (in[2] == 0x3C) && (in[3] == 0x00))
244 return(XML_CHAR_ENCODING_UCS4_2143);
245 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
246 (in[2] == 0x00) && (in[3] == 0x00))
247 return(XML_CHAR_ENCODING_UCS4_3412);
248 if ((in[0] == 0xFE) && (in[1] == 0xFF))
249 return(XML_CHAR_ENCODING_UTF16BE);
250 if ((in[0] == 0xFF) && (in[1] == 0xFE))
251 return(XML_CHAR_ENCODING_UTF16LE);
252 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
253 (in[2] == 0xA7) && (in[3] == 0x94))
254 return(XML_CHAR_ENCODING_EBCDIC);
255 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
256 (in[2] == 0x78) && (in[3] == 0x6D))
257 return(XML_CHAR_ENCODING_UTF8);
258 return(XML_CHAR_ENCODING_NONE);
259}
260
261/**
262 * xmlParseCharEncoding:
263 * @name: the encoding name as parsed, in UTF-8 format (ASCCI actually)
264 *
265 * Conpare the string to the known encoding schemes already known. Note
266 * that the comparison is case insensitive accordingly to the section
267 * [XML] 4.3.3 Character Encoding in Entities.
268 *
269 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
270 * if not recognized.
271 */
272xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000273xmlParseCharEncoding(const char* name)
Daniel Veillard27d88741999-05-29 11:51:49 +0000274{
275 char upper[500];
276 int i;
277
278 for (i = 0;i < 499;i++) {
279 upper[i] = toupper(name[i]);
280 if (upper[i] == 0) break;
281 }
282 upper[i] = 0;
283
284 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
285 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
286 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
287
288 /*
289 * NOTE: if we were able to parse this, the endianness of UTF16 is
290 * already found and in use
291 */
292 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
293 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
294
295 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
296 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
297 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
298
299 /*
300 * NOTE: if we were able to parse this, the endianness of UCS4 is
301 * already found and in use
302 */
303 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
304 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
305 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
306
307
308 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
309 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
310 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
311
312 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
313 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
314 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
315
316 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
317 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
318 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
319 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
320 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
321 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
322 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
323
324 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
325 if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
326 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
327 return(XML_CHAR_ENCODING_ERROR);
328}
Daniel Veillard14fff061999-06-22 21:49:07 +0000329
330/****************************************************************
331 * *
332 * Char encoding handlers *
333 * *
334 ****************************************************************/
335
336/* the size should be growable, but it's not a big deal ... */
337#define MAX_ENCODING_HANDLERS 50
338static xmlCharEncodingHandlerPtr *handlers = NULL;
339static int nbCharEncodingHandler = 0;
340
341/*
342 * The default is UTF-8 for XML, that's also the default used for the
343 * parser internals, so the default encoding handler is NULL
344 */
345
346static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
347
348/**
349 * xmlNewCharEncodingHandler:
350 * @name: the encoding name, in UTF-8 format (ASCCI actually)
351 * @input: the xmlCharEncodingInputFunc to read that encoding
352 * @output: the xmlCharEncodingOutputFunc to write that encoding
353 *
354 * Create and registers an xmlCharEncodingHandler.
355 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
356 */
357xmlCharEncodingHandlerPtr
358xmlNewCharEncodingHandler(const char *name, xmlCharEncodingInputFunc input,
359 xmlCharEncodingOutputFunc output) {
360 xmlCharEncodingHandlerPtr handler;
361 char upper[500];
362 int i;
363 char *up = 0;
364
365 /*
366 * Keep only the uppercase version of the encoding.
367 */
368 if (name == NULL) {
369 fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
370 return(NULL);
371 }
372 for (i = 0;i < 499;i++) {
373 upper[i] = toupper(name[i]);
374 if (upper[i] == 0) break;
375 }
376 upper[i] = 0;
377 up = strdup(upper);
378 if (up == NULL) {
379 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
380 return(NULL);
381 }
382
383 /*
384 * allocate and fill-up an handler block.
385 */
386 handler = (xmlCharEncodingHandlerPtr)
387 malloc(sizeof(xmlCharEncodingHandler));
388 if (handler == NULL) {
389 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
390 return(NULL);
391 }
392 handler->input = input;
393 handler->output = output;
394 handler->name = up;
395
396 /*
397 * registers and returns the handler.
398 */
399 xmlRegisterCharEncodingHandler(handler);
400 return(handler);
401}
402
403/**
404 * xmlInitCharEncodingHandlers:
405 *
406 * Initialize the char encoding support, it registers the default
407 * encoding supported.
408 * NOTE: while public theis function usually don't need to be called
409 * in normal processing.
410 */
411void
412xmlInitCharEncodingHandlers(void) {
413 if (handlers != NULL) return;
414
415 handlers = (xmlCharEncodingHandlerPtr *)
416 malloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
417
418 if (handlers == NULL) {
419 fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
420 return;
421 }
422 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
Daniel Veillardb05deb71999-08-10 19:04:08 +0000423#ifdef HAVE_UNICODE_H
424#else
Daniel Veillardb96e6431999-08-29 21:02:19 +0000425 /* xmlNewCharEncodingHandler("UTF-16", UTF16ToUTF8, UTF8ToUTF16); */
Daniel Veillard14fff061999-06-22 21:49:07 +0000426 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
Daniel Veillardb05deb71999-08-10 19:04:08 +0000427#endif
Daniel Veillard14fff061999-06-22 21:49:07 +0000428}
429
430/**
431 * xmlRegisterCharEncodingHandler:
432 * @handler: the xmlCharEncodingHandlerPtr handler block
433 *
434 * Register the char encoding handler, surprizing, isn't it ?
435 */
436void
437xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
438 if (handlers == NULL) xmlInitCharEncodingHandlers();
439 if (handler == NULL) {
440 fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
441 return;
442 }
443
444 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
445 fprintf(stderr,
446 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
447 fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
448 return;
449 }
450 handlers[nbCharEncodingHandler++] = handler;
451}
452
453/**
454 * xmlGetCharEncodingHandler:
455 * @enc: an xmlCharEncoding value.
456 *
457 * Search in the registrered set the handler able to read/write that encoding.
458 *
459 * Returns the handler or NULL if not found
460 */
461xmlCharEncodingHandlerPtr
462xmlGetCharEncodingHandler(xmlCharEncoding enc) {
463 if (handlers == NULL) xmlInitCharEncodingHandlers();
Daniel Veillardb96e6431999-08-29 21:02:19 +0000464 /* TODO xmlGetCharEncodingHandler !!!!!!! */
Daniel Veillard14fff061999-06-22 21:49:07 +0000465 return(NULL);
466}
467
468/**
469 * xmlGetCharEncodingHandler:
470 * @enc: a string describing the char encoding.
471 *
472 * Search in the registrered set the handler able to read/write that encoding.
473 *
474 * Returns the handler or NULL if not found
475 */
476xmlCharEncodingHandlerPtr
477xmlFindCharEncodingHandler(const char *name) {
478 char upper[500];
479 int i;
480
481 if (handlers == NULL) xmlInitCharEncodingHandlers();
482 if (name == NULL) return(xmlDefaultCharEncodingHandler);
483 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
484
485 for (i = 0;i < 499;i++) {
486 upper[i] = toupper(name[i]);
487 if (upper[i] == 0) break;
488 }
489 upper[i] = 0;
490
491 for (i = 0;i < nbCharEncodingHandler; i++)
492 if (!strcmp(name, handlers[i]->name))
493 return(handlers[i]);
494
495 return(NULL);
496}
497