blob: 2184f61ac9d95f5a4dfb24bee11366214d01f48f [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 character codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
21
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "encoding.h"
26
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 */
37
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged, every other
 * byte is expanded to a two-byte UTF-8 sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space. A two-byte
 * sequence is only emitted when both of its bytes fit, so on failure @out
 * never ends with a truncated sequence.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else {
            /* make sure the whole sequence fits before writing any of it */
            if (outend - out < 2) return -1;
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (0x3F & c));
        }
    }
    return (int) (out - outstart);
}
72
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only single-byte sequences and two-byte sequences
 * led by 0xC2 or 0xC3 (code points 0x80 to 0xFF) can be converted.
 * TODO: need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 * if the transcoding failed (character outside ISO Latin 1, truncated
 * sequence, or a lead byte not followed by a continuation byte).
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else if (((c & 0xFE) == 0xC2) && (in < inend) &&
                 ((*in & 0xC0) == 0x80)) {
            /* the second byte must be a continuation byte (10xxxxxx) */
            if (out >= outend) return -1;
            *out++ = (unsigned char) (((c & 0x03) << 6) | (*in++ & 0x3F));
        }
        else return -2;
    }
    return (int) (out - outstart);
}
108
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in (number of shorts)
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. A high surrogate followed by a low surrogate is
 * combined into a single code point before being encoded.
 *
 * Returns the number of bytes written, or -1 by lack of space or if a
 * high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned short* inend = in + inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c = *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in < inend) && (((d = *in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Emit the lead byte; bits is the shift of the first continuation
         * byte, or negative when there is none.
         */
        if (out >= outend) return -1;
        if (c < 0x80) {
            *out++ = (unsigned char) c;
            bits = -6;
        }
        else if (c < 0x800) {
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            bits = 0;
        }
        else if (c < 0x10000) {
            *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
            bits = 6;
        }
        else {
            *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);
            bits = 12;
        }

        /* continuation bytes: 10xxxxxx, most significant group first */
        for ( ; bits >= 0; bits -= 6) {
            if (out >= outend) return -1;
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);
        }
    }
    return (int) (out - outstart);
}
156
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Code points of 0x10000 and above are written as a
 * surrogate pair.
 * TODO: need a fallback mechanism ...
 *
 * Returns the number of shorts written, or -1 by lack of space, or -2
 * if the transcoding failed (malformed or truncated UTF-8 sequence, or
 * a value that cannot be represented in UTF-16).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart = out;
    unsigned short* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d = *in++;
        if      (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) return -2;    /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* input ended mid-sequence, or not a 10xxxxxx byte */
            if ((in >= inend) || (((d = *in++) & 0xC0) != 0x80)) return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = (unsigned short) c;
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs two free slots */
            if (outend - out < 2) return -1;
            c -= 0x10000;
            *out++ = (unsigned short) (0xD800 | (c >> 10));
            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
        }
        else return -2;    /* beyond the range UTF-16 can encode */
    }
    return (int) (out - outstart);
}
208
Daniel Veillard97b58771998-10-20 06:14:16 +0000209
Daniel Veillard27d88741999-05-29 11:51:49 +0000210/**
211 * xmlDetectCharEncoding:
212 * @in: a pointer to the first bytes of the XML entity, must be at least
213 * 4 bytes long.
214 *
215 * Guess the encoding of the entity using the first bytes of the entity content
216 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
217 *
218 * Returns one of the XML_CHAR_ENCODING_... values.
219 */
220xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000221xmlDetectCharEncoding(const unsigned char* in)
Daniel Veillard27d88741999-05-29 11:51:49 +0000222{
223 if ((in[0] == 0x00) && (in[1] == 0x00) &&
224 (in[2] == 0x00) && (in[3] == 0x3C))
225 return(XML_CHAR_ENCODING_UCS4BE);
226 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
227 (in[2] == 0x00) && (in[3] == 0x00))
228 return(XML_CHAR_ENCODING_UCS4LE);
229 if ((in[0] == 0x00) && (in[1] == 0x00) &&
230 (in[2] == 0x3C) && (in[3] == 0x00))
231 return(XML_CHAR_ENCODING_UCS4_2143);
232 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
233 (in[2] == 0x00) && (in[3] == 0x00))
234 return(XML_CHAR_ENCODING_UCS4_3412);
235 if ((in[0] == 0xFE) && (in[1] == 0xFF))
236 return(XML_CHAR_ENCODING_UTF16BE);
237 if ((in[0] == 0xFF) && (in[1] == 0xFE))
238 return(XML_CHAR_ENCODING_UTF16LE);
239 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
240 (in[2] == 0xA7) && (in[3] == 0x94))
241 return(XML_CHAR_ENCODING_EBCDIC);
242 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
243 (in[2] == 0x78) && (in[3] == 0x6D))
244 return(XML_CHAR_ENCODING_UTF8);
245 return(XML_CHAR_ENCODING_NONE);
246}
247
248/**
249 * xmlParseCharEncoding:
250 * @name: the encoding name as parsed, in UTF-8 format (ASCCI actually)
251 *
252 * Conpare the string to the known encoding schemes already known. Note
253 * that the comparison is case insensitive accordingly to the section
254 * [XML] 4.3.3 Character Encoding in Entities.
255 *
256 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
257 * if not recognized.
258 */
259xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000260xmlParseCharEncoding(const char* name)
Daniel Veillard27d88741999-05-29 11:51:49 +0000261{
262 char upper[500];
263 int i;
264
265 for (i = 0;i < 499;i++) {
266 upper[i] = toupper(name[i]);
267 if (upper[i] == 0) break;
268 }
269 upper[i] = 0;
270
271 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
272 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
273 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
274
275 /*
276 * NOTE: if we were able to parse this, the endianness of UTF16 is
277 * already found and in use
278 */
279 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
280 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
281
282 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
283 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
284 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
285
286 /*
287 * NOTE: if we were able to parse this, the endianness of UCS4 is
288 * already found and in use
289 */
290 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
291 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
292 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
293
294
295 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
296 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
297 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
298
299 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
300 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
301 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
302
303 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
304 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
305 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
306 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
307 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
308 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
309 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
310
311 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
312 if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
313 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
314 return(XML_CHAR_ENCODING_ERROR);
315}
Daniel Veillard14fff061999-06-22 21:49:07 +0000316
/****************************************************************
 *								*
 *		Char encoding handlers				*
 *								*
 ****************************************************************/
322
323/* the size should be growable, but it's not a big deal ... */
324#define MAX_ENCODING_HANDLERS 50
325static xmlCharEncodingHandlerPtr *handlers = NULL;
326static int nbCharEncodingHandler = 0;
327
328/*
329 * The default is UTF-8 for XML, that's also the default used for the
330 * parser internals, so the default encoding handler is NULL
331 */
332
333static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
334
335/**
336 * xmlNewCharEncodingHandler:
337 * @name: the encoding name, in UTF-8 format (ASCCI actually)
338 * @input: the xmlCharEncodingInputFunc to read that encoding
339 * @output: the xmlCharEncodingOutputFunc to write that encoding
340 *
341 * Create and registers an xmlCharEncodingHandler.
342 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
343 */
344xmlCharEncodingHandlerPtr
345xmlNewCharEncodingHandler(const char *name, xmlCharEncodingInputFunc input,
346 xmlCharEncodingOutputFunc output) {
347 xmlCharEncodingHandlerPtr handler;
348 char upper[500];
349 int i;
350 char *up = 0;
351
352 /*
353 * Keep only the uppercase version of the encoding.
354 */
355 if (name == NULL) {
356 fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
357 return(NULL);
358 }
359 for (i = 0;i < 499;i++) {
360 upper[i] = toupper(name[i]);
361 if (upper[i] == 0) break;
362 }
363 upper[i] = 0;
364 up = strdup(upper);
365 if (up == NULL) {
366 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
367 return(NULL);
368 }
369
370 /*
371 * allocate and fill-up an handler block.
372 */
373 handler = (xmlCharEncodingHandlerPtr)
374 malloc(sizeof(xmlCharEncodingHandler));
375 if (handler == NULL) {
376 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
377 return(NULL);
378 }
379 handler->input = input;
380 handler->output = output;
381 handler->name = up;
382
383 /*
384 * registers and returns the handler.
385 */
386 xmlRegisterCharEncodingHandler(handler);
387 return(handler);
388}
389
390/**
391 * xmlInitCharEncodingHandlers:
392 *
393 * Initialize the char encoding support, it registers the default
394 * encoding supported.
395 * NOTE: while public theis function usually don't need to be called
396 * in normal processing.
397 */
398void
399xmlInitCharEncodingHandlers(void) {
400 if (handlers != NULL) return;
401
402 handlers = (xmlCharEncodingHandlerPtr *)
403 malloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
404
405 if (handlers == NULL) {
406 fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
407 return;
408 }
409 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
410 xmlNewCharEncodingHandler("UTF-16", UTF16ToUTF8, UTF8ToUTF16);
411 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
412}
413
414/**
415 * xmlRegisterCharEncodingHandler:
416 * @handler: the xmlCharEncodingHandlerPtr handler block
417 *
418 * Register the char encoding handler, surprizing, isn't it ?
419 */
420void
421xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
422 if (handlers == NULL) xmlInitCharEncodingHandlers();
423 if (handler == NULL) {
424 fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
425 return;
426 }
427
428 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
429 fprintf(stderr,
430 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
431 fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
432 return;
433 }
434 handlers[nbCharEncodingHandler++] = handler;
435}
436
437/**
438 * xmlGetCharEncodingHandler:
439 * @enc: an xmlCharEncoding value.
440 *
441 * Search in the registrered set the handler able to read/write that encoding.
442 *
443 * Returns the handler or NULL if not found
444 */
445xmlCharEncodingHandlerPtr
446xmlGetCharEncodingHandler(xmlCharEncoding enc) {
447 if (handlers == NULL) xmlInitCharEncodingHandlers();
448 return(NULL);
449}
450
451/**
452 * xmlGetCharEncodingHandler:
453 * @enc: a string describing the char encoding.
454 *
455 * Search in the registrered set the handler able to read/write that encoding.
456 *
457 * Returns the handler or NULL if not found
458 */
459xmlCharEncodingHandlerPtr
460xmlFindCharEncodingHandler(const char *name) {
461 char upper[500];
462 int i;
463
464 if (handlers == NULL) xmlInitCharEncodingHandlers();
465 if (name == NULL) return(xmlDefaultCharEncodingHandler);
466 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
467
468 for (i = 0;i < 499;i++) {
469 upper[i] = toupper(name[i]);
470 if (upper[i] == 0) break;
471 }
472 upper[i] = 0;
473
474 for (i = 0;i < nbCharEncodingHandler; i++)
475 if (!strcmp(name, handlers[i]->name))
476 return(handlers[i]);
477
478 return(NULL);
479}
480