blob: 1771ec1c7d5ad42dc765c6f8219ae0b4120a4b26 [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
21
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "encoding.h"
#ifdef HAVE_UNICODE_H
#include <unicode.h>
#endif
Daniel Veillard891e4041998-10-19 00:43:02 +000029
Daniel Veillardb05deb71999-08-10 19:04:08 +000030#ifdef HAVE_UNICODE_H
31
32#else /* ! HAVE_UNICODE_H */
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F          0xxxxxxx
 * 0000 0080-0000 07FF          110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF          1110xxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 */
43
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged; every other
 * byte is expanded to a two-byte UTF-8 sequence.
 *
 * A two-byte sequence is only emitted when both bytes fit, so @out never
 * ends with a dangling lead byte when -1 is returned.
 *
 * Returns the number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend)
                return(-1);
            *out++ = c;
        } else {
            /* Need room for the lead byte AND the continuation byte. */
            if (outend - out < 2)
                return(-1);
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (c & 0x3F));
        }
    }
    return((int) (out - outstart));
}
78
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only ASCII bytes and the two-byte sequences led by
 * 0xC2 or 0xC3 (code points 0x80..0xFF) can be represented; anything else
 * makes the conversion fail.
 * TODO: need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed (character outside Latin 1, truncated
 *     sequence, or invalid continuation byte).
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend)
                return(-1);
            *out++ = c;
        } else if ((c & 0xFE) == 0xC2) {
            /* The second byte must exist and look like 10xxxxxx. */
            if ((in >= inend) || ((*in & 0xC0) != 0x80))
                return(-2);
            if (out >= outend)
                return(-1);
            *out++ = (unsigned char) (((c & 0x03) << 6) | (*in++ & 0x3F));
        } else {
            return(-2);
        }
    }
    return((int) (out - outstart));
}
114
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in, in 16-bit units
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. A high surrogate followed by a low surrogate is
 * combined into one code point and emitted as a four-byte sequence.
 * A low surrogate that is not preceded by a high surrogate is not
 * rejected; it is encoded like any other 16-bit value.
 *
 * A character is only emitted when all of its bytes fit in @out.
 *
 * Returns the number of bytes written, or -1 by lack of space. -1 is also
 *     returned when a high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned short* inend = in + inlen;
    unsigned int c, d, lead;
    int bits;

    while (in < inend) {
        c = *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in >= inend) || (((d = *in++) & 0xFC00) != 0xDC00))
                return(-1);
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Pick the lead byte; bits is the shift of the first continuation
         * byte (-6 means there is none).
         */
        if      (c <    0x80) { lead = c;                 bits = -6; }
        else if (c <   0x800) { lead = (c >>  6) | 0xC0;  bits =  0; }
        else if (c < 0x10000) { lead = (c >> 12) | 0xE0;  bits =  6; }
        else                  { lead = (c >> 18) | 0xF0;  bits = 12; }

        /* One lead byte plus one continuation byte per 6-bit group. */
        if (outend - out < 2 + bits / 6)
            return(-1);

        *out++ = (unsigned char) lead;
        for ( ; bits >= 0; bits -= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);
    }
    return((int) (out - outstart));
}
162
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Code points of 0x10000 and above are written as a
 * surrogate pair, and only when both halves fit in @out.
 * TODO: need a fallback mechanism ...
 *
 * Returns the number of shorts written, or -1 by lack of space, or -2
 *     if the transcoding failed (invalid lead byte, missing or invalid
 *     continuation byte, or a value above 0x10FFFF).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart = out;
    unsigned short* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d = *in++;
        if      (d < 0x80) { c = d;        trailing = 0; }
        else if (d < 0xC0) return(-2);    /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else return(-2);                  /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* Malformed input is a transcoding failure, not lack of space. */
            if ((in >= inend) || (((d = *in++) & 0xC0) != 0x80))
                return(-2);
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend)
                return(-1);
            *out++ = (unsigned short) c;
        } else if (c < 0x110000) {
            if (outend - out < 2)
                return(-1);
            c -= 0x10000;
            *out++ = (unsigned short) (0xD800 | (c >> 10));
            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
        } else {
            return(-2);                   /* not representable in UTF-16 */
        }
    }
    return((int) (out - outstart));
}
214
Daniel Veillardb05deb71999-08-10 19:04:08 +0000215#endif /* ! HAVE_UNICODE_H */
Daniel Veillard97b58771998-10-20 06:14:16 +0000216
Daniel Veillard27d88741999-05-29 11:51:49 +0000217/**
218 * xmlDetectCharEncoding:
219 * @in: a pointer to the first bytes of the XML entity, must be at least
220 * 4 bytes long.
221 *
222 * Guess the encoding of the entity using the first bytes of the entity content
223 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
224 *
225 * Returns one of the XML_CHAR_ENCODING_... values.
226 */
227xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000228xmlDetectCharEncoding(const unsigned char* in)
Daniel Veillard27d88741999-05-29 11:51:49 +0000229{
230 if ((in[0] == 0x00) && (in[1] == 0x00) &&
231 (in[2] == 0x00) && (in[3] == 0x3C))
232 return(XML_CHAR_ENCODING_UCS4BE);
233 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
234 (in[2] == 0x00) && (in[3] == 0x00))
235 return(XML_CHAR_ENCODING_UCS4LE);
236 if ((in[0] == 0x00) && (in[1] == 0x00) &&
237 (in[2] == 0x3C) && (in[3] == 0x00))
238 return(XML_CHAR_ENCODING_UCS4_2143);
239 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
240 (in[2] == 0x00) && (in[3] == 0x00))
241 return(XML_CHAR_ENCODING_UCS4_3412);
242 if ((in[0] == 0xFE) && (in[1] == 0xFF))
243 return(XML_CHAR_ENCODING_UTF16BE);
244 if ((in[0] == 0xFF) && (in[1] == 0xFE))
245 return(XML_CHAR_ENCODING_UTF16LE);
246 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
247 (in[2] == 0xA7) && (in[3] == 0x94))
248 return(XML_CHAR_ENCODING_EBCDIC);
249 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
250 (in[2] == 0x78) && (in[3] == 0x6D))
251 return(XML_CHAR_ENCODING_UTF8);
252 return(XML_CHAR_ENCODING_NONE);
253}
254
255/**
256 * xmlParseCharEncoding:
257 * @name: the encoding name as parsed, in UTF-8 format (ASCCI actually)
258 *
259 * Conpare the string to the known encoding schemes already known. Note
260 * that the comparison is case insensitive accordingly to the section
261 * [XML] 4.3.3 Character Encoding in Entities.
262 *
263 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
264 * if not recognized.
265 */
266xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000267xmlParseCharEncoding(const char* name)
Daniel Veillard27d88741999-05-29 11:51:49 +0000268{
269 char upper[500];
270 int i;
271
272 for (i = 0;i < 499;i++) {
273 upper[i] = toupper(name[i]);
274 if (upper[i] == 0) break;
275 }
276 upper[i] = 0;
277
278 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
279 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
280 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
281
282 /*
283 * NOTE: if we were able to parse this, the endianness of UTF16 is
284 * already found and in use
285 */
286 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
287 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
288
289 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
290 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
291 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
292
293 /*
294 * NOTE: if we were able to parse this, the endianness of UCS4 is
295 * already found and in use
296 */
297 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
298 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
299 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
300
301
302 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
303 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
304 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
305
306 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
307 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
308 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
309
310 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
311 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
312 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
313 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
314 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
315 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
316 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
317
318 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
319 if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
320 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
321 return(XML_CHAR_ENCODING_ERROR);
322}
Daniel Veillard14fff061999-06-22 21:49:07 +0000323
324/****************************************************************
325 * *
326 * Char encoding handlers *
327 * *
328 ****************************************************************/
329
330/* the size should be growable, but it's not a big deal ... */
331#define MAX_ENCODING_HANDLERS 50
332static xmlCharEncodingHandlerPtr *handlers = NULL;
333static int nbCharEncodingHandler = 0;
334
335/*
336 * The default is UTF-8 for XML, that's also the default used for the
337 * parser internals, so the default encoding handler is NULL
338 */
339
340static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
341
342/**
343 * xmlNewCharEncodingHandler:
344 * @name: the encoding name, in UTF-8 format (ASCCI actually)
345 * @input: the xmlCharEncodingInputFunc to read that encoding
346 * @output: the xmlCharEncodingOutputFunc to write that encoding
347 *
348 * Create and registers an xmlCharEncodingHandler.
349 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
350 */
351xmlCharEncodingHandlerPtr
352xmlNewCharEncodingHandler(const char *name, xmlCharEncodingInputFunc input,
353 xmlCharEncodingOutputFunc output) {
354 xmlCharEncodingHandlerPtr handler;
355 char upper[500];
356 int i;
357 char *up = 0;
358
359 /*
360 * Keep only the uppercase version of the encoding.
361 */
362 if (name == NULL) {
363 fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
364 return(NULL);
365 }
366 for (i = 0;i < 499;i++) {
367 upper[i] = toupper(name[i]);
368 if (upper[i] == 0) break;
369 }
370 upper[i] = 0;
371 up = strdup(upper);
372 if (up == NULL) {
373 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
374 return(NULL);
375 }
376
377 /*
378 * allocate and fill-up an handler block.
379 */
380 handler = (xmlCharEncodingHandlerPtr)
381 malloc(sizeof(xmlCharEncodingHandler));
382 if (handler == NULL) {
383 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
384 return(NULL);
385 }
386 handler->input = input;
387 handler->output = output;
388 handler->name = up;
389
390 /*
391 * registers and returns the handler.
392 */
393 xmlRegisterCharEncodingHandler(handler);
394 return(handler);
395}
396
397/**
398 * xmlInitCharEncodingHandlers:
399 *
400 * Initialize the char encoding support, it registers the default
401 * encoding supported.
402 * NOTE: while public theis function usually don't need to be called
403 * in normal processing.
404 */
405void
406xmlInitCharEncodingHandlers(void) {
407 if (handlers != NULL) return;
408
409 handlers = (xmlCharEncodingHandlerPtr *)
410 malloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
411
412 if (handlers == NULL) {
413 fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
414 return;
415 }
416 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
Daniel Veillardb05deb71999-08-10 19:04:08 +0000417#ifdef HAVE_UNICODE_H
418#else
Daniel Veillard14fff061999-06-22 21:49:07 +0000419 xmlNewCharEncodingHandler("UTF-16", UTF16ToUTF8, UTF8ToUTF16);
420 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
Daniel Veillardb05deb71999-08-10 19:04:08 +0000421#endif
Daniel Veillard14fff061999-06-22 21:49:07 +0000422}
423
424/**
425 * xmlRegisterCharEncodingHandler:
426 * @handler: the xmlCharEncodingHandlerPtr handler block
427 *
428 * Register the char encoding handler, surprizing, isn't it ?
429 */
430void
431xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
432 if (handlers == NULL) xmlInitCharEncodingHandlers();
433 if (handler == NULL) {
434 fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
435 return;
436 }
437
438 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
439 fprintf(stderr,
440 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
441 fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
442 return;
443 }
444 handlers[nbCharEncodingHandler++] = handler;
445}
446
447/**
448 * xmlGetCharEncodingHandler:
449 * @enc: an xmlCharEncoding value.
450 *
451 * Search in the registrered set the handler able to read/write that encoding.
452 *
453 * Returns the handler or NULL if not found
454 */
455xmlCharEncodingHandlerPtr
456xmlGetCharEncodingHandler(xmlCharEncoding enc) {
457 if (handlers == NULL) xmlInitCharEncodingHandlers();
Daniel Veillarde2d034d1999-07-27 19:52:06 +0000458 /* TODO !!!!!!! */
Daniel Veillard14fff061999-06-22 21:49:07 +0000459 return(NULL);
460}
461
462/**
463 * xmlGetCharEncodingHandler:
464 * @enc: a string describing the char encoding.
465 *
466 * Search in the registrered set the handler able to read/write that encoding.
467 *
468 * Returns the handler or NULL if not found
469 */
470xmlCharEncodingHandlerPtr
471xmlFindCharEncodingHandler(const char *name) {
472 char upper[500];
473 int i;
474
475 if (handlers == NULL) xmlInitCharEncodingHandlers();
476 if (name == NULL) return(xmlDefaultCharEncodingHandler);
477 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
478
479 for (i = 0;i < 499;i++) {
480 upper[i] = toupper(name[i]);
481 if (upper[i] == 0) break;
482 }
483 upper[i] = 0;
484
485 for (i = 0;i < nbCharEncodingHandler; i++)
486 if (!strcmp(name, handlers[i]->name))
487 return(handlers[i]);
488
489 return(NULL);
490}
491