blob: 7d1a97164f80cd239b1e6708c730ed2b1bf885c7 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
15 *
Owen Taylor3473f882001-02-23 17:55:21 +000016 * See Copyright for the status of this software.
17 *
Daniel Veillardc5d64342001-06-24 12:13:24 +000018 * daniel@veillard.com
Daniel Veillard97ac1312001-05-30 19:14:17 +000019 *
20 * UTF8 string routines from:
21 * "William M. Brack" <wbrack@mmm.com.hk>
22 *
23 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Owen Taylor3473f882001-02-23 17:55:21 +000024 */
25
Daniel Veillard34ce8be2002-03-18 19:37:11 +000026#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000027#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000028
Owen Taylor3473f882001-02-23 17:55:21 +000029#include <string.h>
30
31#ifdef HAVE_CTYPE_H
32#include <ctype.h>
33#endif
34#ifdef HAVE_STDLIB_H
35#include <stdlib.h>
36#endif
Owen Taylor3473f882001-02-23 17:55:21 +000037#ifdef LIBXML_ICONV_ENABLED
38#ifdef HAVE_ERRNO_H
39#include <errno.h>
40#endif
41#endif
42#include <libxml/encoding.h>
43#include <libxml/xmlmemory.h>
44#ifdef LIBXML_HTML_ENABLED
45#include <libxml/HTMLparser.h>
46#endif
Daniel Veillard64a411c2001-10-15 12:32:07 +000047#include <libxml/globals.h>
Daniel Veillarda4617b82001-11-04 20:19:12 +000048#include <libxml/xmlerror.h>
Owen Taylor3473f882001-02-23 17:55:21 +000049
Daniel Veillard22090732001-07-16 00:06:07 +000050static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
51static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +000052
/*
 * One entry of the encoding alias table: a pair of strings.
 * NOTE(review): presumably @alias is an alternate spelling that resolves
 * to the encoding called @name -- confirm against the alias API below.
 */
typedef struct _xmlCharEncodingAlias {
    const char *name;
    const char *alias;
} xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;

/*
 * Alias table state; all three start out empty/zero.
 * NOTE(review): Nb/Max look like "entries in use" and "entries allocated"
 * -- confirm against the code that grows the table.
 */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;
static int xmlCharEncodingAliasesMax = 0;
63
64#ifdef LIBXML_ICONV_ENABLED
65#if 0
66#define DEBUG_ENCODING /* Define this to get encoding traces */
67#endif
68#endif
69
70static int xmlLittleEndian = 1;
71
Daniel Veillard97ac1312001-05-30 19:14:17 +000072/************************************************************************
73 * *
74 * Generic UTF8 handling routines *
75 * *
76 * From rfc2044: encoding of the Unicode values on UTF-8: *
77 * *
78 * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
79 * 0000 0000-0000 007F 0xxxxxxx *
80 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
81 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
82 * *
83 * I hope we won't use values > 0xFFFF anytime soon ! *
84 * *
85 ************************************************************************/
Owen Taylor3473f882001-02-23 17:55:21 +000086
87/**
William M. Brack4a557d92003-07-29 04:28:04 +000088 * xmlUTF8Size:
89 * @utf: pointer to the UTF8 character
90 *
91 * returns the numbers of bytes in the character, -1 on format error
92 */
93int
94xmlUTF8Size(const xmlChar *utf) {
95 xmlChar mask;
96 int len;
97
98 if (utf == NULL)
99 return -1;
100 if (*utf < 0x80)
101 return 1;
102 /* check valid UTF8 character */
103 if (!(*utf & 0x40))
104 return -1;
105 /* determine number of bytes in char */
106 len = 2;
107 for (mask=0x20; mask != 0; mask>>=1) {
108 if (!(*utf & mask))
109 return len;
110 len++;
111 }
112 return -1;
113}
114
115/**
116 * xmlUTF8Charcmp
117 * @utf1: pointer to first UTF8 char
118 * @utf2: pointer to second UTF8 char
119 *
120 * returns result of comparing the two UCS4 values
121 * as with xmlStrncmp
122 */
123int
124xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
125
126 if (utf1 == NULL ) {
127 if (utf2 == NULL)
128 return 0;
129 return -1;
130 }
Daniel Veillard9ff7de12003-07-29 13:30:42 +0000131 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
William M. Brack4a557d92003-07-29 04:28:04 +0000132}
133
134/**
Daniel Veillarde043ee12001-04-16 14:08:07 +0000135 * xmlUTF8Strlen:
136 * @utf: a sequence of UTF-8 encoded bytes
137 *
Daniel Veillard60087f32001-10-10 09:45:09 +0000138 * compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillarde043ee12001-04-16 14:08:07 +0000139 * checking of the content of the string.
140 *
141 * Returns the number of characters in the string or -1 in case of error
142 */
143int
Daniel Veillard97ac1312001-05-30 19:14:17 +0000144xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillarde043ee12001-04-16 14:08:07 +0000145 int ret = 0;
146
147 if (utf == NULL)
148 return(-1);
149
150 while (*utf != 0) {
151 if (utf[0] & 0x80) {
152 if ((utf[1] & 0xc0) != 0x80)
153 return(-1);
154 if ((utf[0] & 0xe0) == 0xe0) {
155 if ((utf[2] & 0xc0) != 0x80)
156 return(-1);
157 if ((utf[0] & 0xf0) == 0xf0) {
158 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
159 return(-1);
160 utf += 4;
161 } else {
162 utf += 3;
163 }
164 } else {
165 utf += 2;
166 }
167 } else {
168 utf++;
169 }
170 ret++;
171 }
172 return(ret);
173}
174
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  in: number of bytes available at @utf; out: number of bytes used
 *
 * Read one UTF8 Char from @utf.  At most 4 bytes are decoded and only
 * the shape of the continuation bytes is verified.
 *
 * Returns the char value or -1 in case of error.  On success @len is
 * updated with the number of bytes used; on error it is set to 0,
 * unless @len itself is NULL in which case nothing is written.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    /*
     * Without @len there is neither a bound to honour nor a place to
     * report the consumed size: fail before the error path touches it.
     */
    if (len == NULL)
        return(-1);
    if (utf == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    *len = 0;
    return(-1);
}
241
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * The accepted byte patterns are:
 *    0xxxxxxx                                  1-byte code
 *    110xxxxx 10xxxxxx                         2-byte code
 *    1110xxxx 10xxxxxx 10xxxxxx                3-byte code
 *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx       4-byte code
 * Anything else, including a continuation byte found where a lead
 * byte is expected, makes the string invalid.
 *
 * Return value: true if @utf is valid, false otherwise (also for NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix = 0;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * The || chains below stop at the first failing byte, so the
     * terminating 0 (which is not a continuation byte) is never
     * stepped over.
     */
    while ((c = utf[ix]) != 0) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code */
            ix += 1;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte, or lead byte of a 5+ byte form */
            return(0);
        }
    }
    return(1);
}
285
286/**
Daniel Veillard97ac1312001-05-30 19:14:17 +0000287 * xmlUTF8Strsize:
288 * @utf: a sequence of UTF-8 encoded bytes
289 * @len: the number of characters in the array
290 *
291 * storage size of an UTF8 string
292 *
293 * Returns the storage size of
294 * the first 'len' characters of ARRAY
295 *
296 */
297
298int
299xmlUTF8Strsize(const xmlChar *utf, int len) {
300 const xmlChar *ptr=utf;
301 xmlChar ch;
302
303 if (len <= 0)
304 return(0);
305
306 while ( len-- > 0) {
307 if ( !*ptr )
308 break;
309 if ( (ch = *ptr++) & 0x80)
310 while ( (ch<<=1) & 0x80 )
311 ptr++;
312 }
313 return (ptr - utf);
314}
315
316
317/**
318 * xmlUTF8Strndup:
319 * @utf: the input UTF8 *
320 * @len: the len of @utf (in chars)
321 *
322 * a strndup for array of UTF8's
323 *
324 * Returns a new UTF8 * or NULL
325 */
326xmlChar *
327xmlUTF8Strndup(const xmlChar *utf, int len) {
328 xmlChar *ret;
329 int i;
330
331 if ((utf == NULL) || (len < 0)) return(NULL);
332 i = xmlUTF8Strsize(utf, len);
Daniel Veillard3c908dc2003-04-19 00:07:51 +0000333 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
Daniel Veillard97ac1312001-05-30 19:14:17 +0000334 if (ret == NULL) {
335 xmlGenericError(xmlGenericErrorContext,
336 "malloc of %ld byte failed\n",
337 (len + 1) * (long)sizeof(xmlChar));
338 return(NULL);
339 }
340 memcpy(ret, utf, i * sizeof(xmlChar));
341 ret[i] = 0;
342 return(ret);
343}
344
345/**
346 * xmlUTF8Strpos:
347 * @utf: the input UTF8 *
348 * @pos: the position of the desired UTF8 char (in chars)
349 *
350 * a function to provide the equivalent of fetching a
351 * character from a string array
352 *
353 * Returns a pointer to the UTF8 character or NULL
354 */
355xmlChar *
356xmlUTF8Strpos(const xmlChar *utf, int pos) {
357 xmlChar ch;
358
359 if (utf == NULL) return(NULL);
360 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
361 return(NULL);
362 while (pos--) {
363 if ((ch=*utf++) == 0) return(NULL);
364 if ( ch & 0x80 ) {
365 /* if not simple ascii, verify proper format */
366 if ( (ch & 0xc0) != 0xc0 )
367 return(NULL);
368 /* then skip over remaining bytes for this char */
369 while ( (ch <<= 1) & 0x80 )
370 if ( (*utf++ & 0xc0) != 0x80 )
371 return(NULL);
372 }
373 }
374 return((xmlChar *)utf);
375}
376
377/**
378 * xmlUTF8Strloc:
379 * @utf: the input UTF8 *
380 * @utfchar: the UTF8 character to be found
381 *
382 * a function to provide relative location of a UTF8 char
383 *
384 * Returns the relative character position of the desired char
385 * or -1 if not found
386 */
387int
388xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
389 int i, size;
390 xmlChar ch;
391
392 if (utf==NULL || utfchar==NULL) return -1;
393 size = xmlUTF8Strsize(utfchar, 1);
394 for(i=0; (ch=*utf) != 0; i++) {
395 if (xmlStrncmp(utf, utfchar, size)==0)
396 return(i);
397 utf++;
398 if ( ch & 0x80 ) {
399 /* if not simple ascii, verify proper format */
400 if ( (ch & 0xc0) != 0xc0 )
401 return(-1);
402 /* then skip over remaining bytes for this char */
403 while ( (ch <<= 1) & 0x80 )
404 if ( (*utf++ & 0xc0) != 0x80 )
405 return(-1);
406 }
407 }
408
409 return(-1);
410}
411/**
412 * xmlUTF8Strsub:
413 * @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard97ac1312001-05-30 19:14:17 +0000414 * @start: relative pos of first char
415 * @len: total number to copy
416 *
417 * Note: positions are given in units of UTF-8 chars
418 *
419 * Returns a pointer to a newly created string
420 * or NULL if any problem
421 */
422
423xmlChar *
424xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
425 int i;
426 xmlChar ch;
427
428 if (utf == NULL) return(NULL);
429 if (start < 0) return(NULL);
430 if (len < 0) return(NULL);
431
432 /*
433 * Skip over any leading chars
434 */
435 for (i = 0;i < start;i++) {
436 if ((ch=*utf++) == 0) return(NULL);
437 if ( ch & 0x80 ) {
438 /* if not simple ascii, verify proper format */
439 if ( (ch & 0xc0) != 0xc0 )
440 return(NULL);
441 /* then skip over remaining bytes for this char */
442 while ( (ch <<= 1) & 0x80 )
443 if ( (*utf++ & 0xc0) != 0x80 )
444 return(NULL);
445 }
446 }
447
448 return(xmlUTF8Strndup(utf, len));
449}
450
451/************************************************************************
452 * *
453 * Conversions To/From UTF8 encoding *
454 * *
455 ************************************************************************/
456
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out.  ASCII bytes are copied through unchanged; the
 * conversion stops at the first byte >= 0x80, or once fewer than six
 * free bytes would remain in @out.
 *
 * Returns 0 if success, or -1 if a non-ASCII byte is met.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const int capacity = *outlen;
    const int available = *inlen;
    unsigned char octet;
    int written = 0;
    int consumed = 0;
    int status = 0;

    /* the "+ 5" keeps the same safety margin as the UTF-16 converters */
    while ((consumed < available) && (written + 5 < capacity)) {
        octet = in[consumed];
        if (octet >= 0x80) {
            /* not ASCII: report what was converted so far */
            status = -1;
            break;
        }
        out[written++] = octet;
        consumed++;
    }

    *outlen = written;
    *inlen = consumed;
    return(status);
}
507
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.  A NULL @in is an initialization call and
 * produces nothing.
 *
 * Returns 0 if success (including when the conversion stopped early
 * because @out is full or the last character of @in is incomplete),
 * -2 if the transcoding fails: malformed UTF-8 or a character that
 * does not exist in ASCII.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto fail;  /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto fail;  /* no chance for this in Ascii */

        /* incomplete character at the end of the block: wait for more */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            /*
             * A byte that is not 10xxxxxx makes the sequence malformed;
             * report it instead of emitting the half-decoded value.
             */
            if ((d & 0xC0) != 0x80)
                goto fail;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80)
            goto fail;                  /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

fail:
    /* only whole characters converted before the failure are reported */
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
590
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.  Bytes below 0x80 are copied as is, the others
 * become a two byte sequence.  The conversion stops when @in is
 * exhausted or @out cannot hold the next character.
 *
 * Returns 0 if success, or -1 if one of the arguments is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend;
    const unsigned char* inend;
    const unsigned char* instop;

    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);

    outend = out + *outlen;
    inend = in + (*inlen);
    instop = inend;

    /*
     * *in is only ever read after checking in against inend/instop, so
     * neither an empty block nor the byte following the block is touched.
     */
    while ((in < inend) && (outend - out > 1)) {
        if (*in >= 0x80) {
            *out++ = (((*in) >> 6) & 0x1F) | 0xC0;
            *out++ = ((*in) & 0x3F) | 0x80;
            ++in;
        }
        /* never copy more ASCII bytes than there is room left in @out */
        if ((instop - in) > (outend - out)) instop = in + (outend - out);
        while ((in < instop) && (*in < 0x80)) {
            *out++ = *in++;
        }
    }
    /* a last ASCII byte still fits when exactly one output byte is left */
    if ((in < inend) && (out < outend) && (*in < 0x80)) {
        *out++ = *in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
640
/**
 * UTF8ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-8 chars
 * @inlenb:  the length of @inb in bytes
 *
 * No op copy operation for UTF8 handling: as many bytes as fit in the
 * smaller of the two buffers are copied verbatim, without looking at
 * character boundaries.
 *
 * Returns 0 on success, or -1 if an argument is NULL or a length is
 * negative.
 * The values of *outlen and *inlenb after a successful return are both
 * the number of octets copied.
 */
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
           const unsigned char* inb, int *inlenb)
{
    int count;

    if (!out || !inb || !outlen || !inlenb)
        return(-1);

    /* copy no more than the smaller of the two buffers allows */
    count = (*outlen > *inlenb) ? *inlenb : *outlen;
    if (count < 0)
        return(-1);

    memcpy(out, inb, count);

    *inlenb = count;
    *outlen = count;
    return(0);
}
677
Daniel Veillarde72c7562002-05-31 09:47:30 +0000678
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.  A NULL @in is an initialization call and
 * produces nothing.
 *
 * Returns 0 if success (including when the conversion stopped early
 * because @out is full or the last character of @in is incomplete),
 * -2 if the transcoding fails: malformed UTF-8 or a character above
 * 0xFF.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *src = in;
    const unsigned char *committed = in;
    const unsigned char *srclimit;
    unsigned char *dst = out;
    const unsigned char *dstlimit;
    unsigned int value, octet;
    int pending;
    int status = 0;

    if (in == NULL) {
        /* initialization call: nothing to convert */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    srclimit = in + (*inlen);
    dstlimit = out + (*outlen);

    while (src < srclimit) {
        octet = *src++;

        /* classify the lead byte */
        if (octet < 0x80) {
            value = octet;
            pending = 0;
        } else if ((octet < 0xC0) || (octet >= 0xF8)) {
            /* continuation byte in lead position, or 5+ byte form */
            status = -2;
            break;
        } else if (octet < 0xE0) {
            value = octet & 0x1F;
            pending = 1;
        } else if (octet < 0xF0) {
            value = octet & 0x0F;
            pending = 2;
        } else {
            value = octet & 0x07;
            pending = 3;
        }

        /* incomplete character at the end of the block: wait for more */
        if (srclimit - src < pending)
            break;

        while (pending > 0) {
            octet = *src++;
            if ((octet & 0xC0) != 0x80) {
                status = -2;
                break;
            }
            value = (value << 6) | (octet & 0x3F);
            pending--;
        }
        if (status != 0)
            break;

        /* anything above 0xFF has no ISO Latin 1 representation */
        if (value > 0xFF) {
            status = -2;
            break;
        }
        if (dst >= dstlimit)
            break;
        *dst++ = value;
        committed = src;
    }

    /* only whole, successfully converted characters are reported */
    *outlen = dst - out;
    *inlen = committed - in;
    return(status);
}
766
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is read one byte at a time, so
 * it needs no particular alignment and the result does not depend on
 * the endianness of the host.  An odd trailing byte is left unconsumed,
 * as is a high surrogate whose low surrogate has not arrived yet.
 *
 * Returns 0 on success (possibly after a partial conversion), or -2
 * if the transcoding fails because a high surrogate is not followed
 * by a low surrogate.
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    /* only whole 16-bit units are looked at */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;

    /* the "+ 5" margin guarantees room for the longest UTF-8 sequence */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* pair split across blocks: wait for the second half */
                break;
            }
            d = in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (out >= outend)
            break;
        if      (c <    0x80) {  *out++=  c;                        bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend)
                break;
            *out++= ((c >> bits) & 0x3F) | 0x80;
        }
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
854
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.  A NULL @in is an initialization call: the Byte
 * Order Mark FF FE is written if @outb has room for it.  The output is
 * written one byte at a time, so @outb needs no particular alignment
 * and the result does not depend on the endianness of the host.
 *
 * Returns 2 when the Byte Order Mark was written, 0 on success
 * (including when the conversion stopped early because @outb is full
 * or the last character of @in is incomplete), or -2 if the
 * transcoding failed: malformed UTF-8 or a value above 0x10FFFF.
 * The value of *inlen after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto fail;  /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto fail;  /* no chance for this in UTF-16 */

        /* incomplete character at the end of the block: wait for more */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            /* anything but 10xxxxxx here makes the sequence malformed */
            if ((d & 0xC0) != 0x80)
                goto fail;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = c & 0xFF;
            out[1] = (c >> 8) & 0xFF;
            out += 2;
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = hi & 0xFF;
            out[1] = (hi >> 8) & 0xFF;
            out[2] = lo & 0xFF;
            out[3] = (lo >> 8) & 0xFF;
            out += 4;
        }
        else
            goto fail;                  /* beyond the reach of UTF-16 */
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

fail:
    /* only whole characters converted before the failure are reported */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
973
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is read one byte at a time, so
 * it needs no particular alignment and the result does not depend on
 * the endianness of the host.  An odd trailing byte is left unconsumed,
 * as is a high surrogate whose low surrogate has not arrived yet.  A
 * character is only emitted when its whole UTF-8 sequence fits in @out.
 *
 * Returns 0 on success (possibly after a partial conversion), or -2
 * if the transcoding fails because a high surrogate is not followed
 * by a low surrogate.
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits, needed;

    /* only whole 16-bit units are looked at */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;

    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* pair split across blocks: wait for the second half */
                break;
            }
            d = ((unsigned int) in[0] << 8) | in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80)  needed = 1;
        else if (c <   0x800)  needed = 2;
        else if (c < 0x10000)  needed = 3;
        else                   needed = 4;

        /*
         * Never emit a truncated sequence: if the character does not fit
         * it stays unconsumed and is converted on the next call.
         */
        if (outend - out < needed)
            break;

        switch (needed) {
            case 1:  *out++=  c;                        bits= -6; break;
            case 2:  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; break;
            case 3:  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; break;
            default: *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; break;
        }
        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
1065
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out.  A NULL @in is an initialization call: the Byte
 * Order Mark FE FF is written if @outb has room for it.  The output is
 * written one byte at a time, so @outb needs no particular alignment
 * and the result does not depend on the endianness of the host.
 *
 * Returns 2 when the Byte Order Mark was written, 0 on success
 * (including when the conversion stopped early because @outb is full
 * or the last character of @in is incomplete), or -2 if the
 * transcoding failed: malformed UTF-8 or a value above 0x10FFFF.
 * The value of *inlen after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto fail;  /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto fail;  /* no chance for this in UTF-16 */

        /* incomplete character at the end of the block: wait for more */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            /* anything but 10xxxxxx here makes the sequence malformed */
            if ((d & 0xC0) != 0x80)
                goto fail;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (c >> 8) & 0xFF;
            out[1] = c & 0xFF;
            out += 2;
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (hi >> 8) & 0xFF;
            out[1] = hi & 0xFF;
            out[2] = (lo >> 8) & 0xFF;
            out[3] = lo & 0xFF;
            out += 4;
        }
        else
            goto fail;                  /* beyond the reach of UTF-16 */
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

fail:
    /* only whole characters converted before the failure are reported */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
1181
Daniel Veillard97ac1312001-05-30 19:14:17 +00001182/************************************************************************
1183 * *
1184 * Generic encoding handling routines *
1185 * *
1186 ************************************************************************/
1187
Owen Taylor3473f882001-02-23 17:55:21 +00001188/**
1189 * xmlDetectCharEncoding:
1190 * @in: a pointer to the first bytes of the XML entity, must be at least
1191 * 4 bytes long.
1192 * @len: pointer to the length of the buffer
1193 *
1194 * Guess the encoding of the entity using the first bytes of the entity content
1195 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1196 *
1197 * Returns one of the XML_CHAR_ENCODING_... values.
1198 */
1199xmlCharEncoding
1200xmlDetectCharEncoding(const unsigned char* in, int len)
1201{
1202 if (len >= 4) {
1203 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1204 (in[2] == 0x00) && (in[3] == 0x3C))
1205 return(XML_CHAR_ENCODING_UCS4BE);
1206 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1207 (in[2] == 0x00) && (in[3] == 0x00))
1208 return(XML_CHAR_ENCODING_UCS4LE);
1209 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1210 (in[2] == 0x3C) && (in[3] == 0x00))
1211 return(XML_CHAR_ENCODING_UCS4_2143);
1212 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1213 (in[2] == 0x00) && (in[3] == 0x00))
1214 return(XML_CHAR_ENCODING_UCS4_3412);
1215 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1216 (in[2] == 0xA7) && (in[3] == 0x94))
1217 return(XML_CHAR_ENCODING_EBCDIC);
1218 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1219 (in[2] == 0x78) && (in[3] == 0x6D))
1220 return(XML_CHAR_ENCODING_UTF8);
1221 }
Daniel Veillard87a764e2001-06-20 17:41:10 +00001222 if (len >= 3) {
1223 /*
1224 * Errata on XML-1.0 June 20 2001
1225 * We now allow an UTF8 encoded BOM
1226 */
1227 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1228 (in[2] == 0xBF))
1229 return(XML_CHAR_ENCODING_UTF8);
1230 }
Owen Taylor3473f882001-02-23 17:55:21 +00001231 if (len >= 2) {
1232 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1233 return(XML_CHAR_ENCODING_UTF16BE);
1234 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1235 return(XML_CHAR_ENCODING_UTF16LE);
1236 }
1237 return(XML_CHAR_ENCODING_NONE);
1238}
1239
1240/**
1241 * xmlCleanupEncodingAliases:
1242 *
1243 * Unregisters all aliases
1244 */
1245void
1246xmlCleanupEncodingAliases(void) {
1247 int i;
1248
1249 if (xmlCharEncodingAliases == NULL)
1250 return;
1251
1252 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1253 if (xmlCharEncodingAliases[i].name != NULL)
1254 xmlFree((char *) xmlCharEncodingAliases[i].name);
1255 if (xmlCharEncodingAliases[i].alias != NULL)
1256 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1257 }
1258 xmlCharEncodingAliasesNb = 0;
1259 xmlCharEncodingAliasesMax = 0;
1260 xmlFree(xmlCharEncodingAliases);
Daniel Veillard73c6e532002-01-08 13:15:33 +00001261 xmlCharEncodingAliases = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001262}
1263
1264/**
1265 * xmlGetEncodingAlias:
1266 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1267 *
1268 * Lookup an encoding name for the given alias.
1269 *
1270 * Returns NULL if not found the original name otherwise
1271 */
1272const char *
1273xmlGetEncodingAlias(const char *alias) {
1274 int i;
1275 char upper[100];
1276
1277 if (alias == NULL)
1278 return(NULL);
1279
1280 if (xmlCharEncodingAliases == NULL)
1281 return(NULL);
1282
1283 for (i = 0;i < 99;i++) {
1284 upper[i] = toupper(alias[i]);
1285 if (upper[i] == 0) break;
1286 }
1287 upper[i] = 0;
1288
1289 /*
1290 * Walk down the list looking for a definition of the alias
1291 */
1292 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1293 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1294 return(xmlCharEncodingAliases[i].name);
1295 }
1296 }
1297 return(NULL);
1298}
1299
1300/**
1301 * xmlAddEncodingAlias:
1302 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1303 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1304 *
1305 * Registers and alias @alias for an encoding named @name. Existing alias
1306 * will be overwritten.
1307 *
1308 * Returns 0 in case of success, -1 in case of error
1309 */
1310int
1311xmlAddEncodingAlias(const char *name, const char *alias) {
1312 int i;
1313 char upper[100];
1314
1315 if ((name == NULL) || (alias == NULL))
1316 return(-1);
1317
1318 for (i = 0;i < 99;i++) {
1319 upper[i] = toupper(alias[i]);
1320 if (upper[i] == 0) break;
1321 }
1322 upper[i] = 0;
1323
1324 if (xmlCharEncodingAliases == NULL) {
1325 xmlCharEncodingAliasesNb = 0;
1326 xmlCharEncodingAliasesMax = 20;
1327 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1328 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1329 if (xmlCharEncodingAliases == NULL)
1330 return(-1);
1331 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1332 xmlCharEncodingAliasesMax *= 2;
1333 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1334 xmlRealloc(xmlCharEncodingAliases,
1335 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1336 }
1337 /*
1338 * Walk down the list looking for a definition of the alias
1339 */
1340 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1341 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1342 /*
1343 * Replace the definition.
1344 */
1345 xmlFree((char *) xmlCharEncodingAliases[i].name);
1346 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1347 return(0);
1348 }
1349 }
1350 /*
1351 * Add the definition
1352 */
1353 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1354 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1355 xmlCharEncodingAliasesNb++;
1356 return(0);
1357}
1358
1359/**
1360 * xmlDelEncodingAlias:
1361 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1362 *
1363 * Unregisters an encoding alias @alias
1364 *
1365 * Returns 0 in case of success, -1 in case of error
1366 */
1367int
1368xmlDelEncodingAlias(const char *alias) {
1369 int i;
1370
1371 if (alias == NULL)
1372 return(-1);
1373
1374 if (xmlCharEncodingAliases == NULL)
1375 return(-1);
1376 /*
1377 * Walk down the list looking for a definition of the alias
1378 */
1379 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1380 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1381 xmlFree((char *) xmlCharEncodingAliases[i].name);
1382 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1383 xmlCharEncodingAliasesNb--;
1384 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1385 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1386 return(0);
1387 }
1388 }
1389 return(-1);
1390}
1391
1392/**
1393 * xmlParseCharEncoding:
1394 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1395 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001396 * Compare the string to the known encoding schemes already known. Note
Owen Taylor3473f882001-02-23 17:55:21 +00001397 * that the comparison is case insensitive accordingly to the section
1398 * [XML] 4.3.3 Character Encoding in Entities.
1399 *
1400 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1401 * if not recognized.
1402 */
1403xmlCharEncoding
1404xmlParseCharEncoding(const char* name)
1405{
1406 const char *alias;
1407 char upper[500];
1408 int i;
1409
1410 if (name == NULL)
1411 return(XML_CHAR_ENCODING_NONE);
1412
1413 /*
1414 * Do the alias resolution
1415 */
1416 alias = xmlGetEncodingAlias(name);
1417 if (alias != NULL)
1418 name = alias;
1419
1420 for (i = 0;i < 499;i++) {
1421 upper[i] = toupper(name[i]);
1422 if (upper[i] == 0) break;
1423 }
1424 upper[i] = 0;
1425
1426 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1427 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1428 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1429
1430 /*
1431 * NOTE: if we were able to parse this, the endianness of UTF16 is
1432 * already found and in use
1433 */
1434 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1435 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1436
1437 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1438 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1439 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1440
1441 /*
1442 * NOTE: if we were able to parse this, the endianness of UCS4 is
1443 * already found and in use
1444 */
1445 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1446 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1447 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1448
1449
1450 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1451 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1452 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1453
1454 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1455 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1456 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1457
1458 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1459 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1460 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1461 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1462 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1463 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1464 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1465
1466 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1467 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1468 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1469
1470#ifdef DEBUG_ENCODING
1471 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1472#endif
1473 return(XML_CHAR_ENCODING_ERROR);
1474}
1475
1476/**
1477 * xmlGetCharEncodingName:
1478 * @enc: the encoding
1479 *
1480 * The "canonical" name for XML encoding.
1481 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1482 * Section 4.3.3 Character Encoding in Entities
1483 *
1484 * Returns the canonical name for the given encoding
1485 */
1486
1487const char*
1488xmlGetCharEncodingName(xmlCharEncoding enc) {
1489 switch (enc) {
1490 case XML_CHAR_ENCODING_ERROR:
1491 return(NULL);
1492 case XML_CHAR_ENCODING_NONE:
1493 return(NULL);
1494 case XML_CHAR_ENCODING_UTF8:
1495 return("UTF-8");
1496 case XML_CHAR_ENCODING_UTF16LE:
1497 return("UTF-16");
1498 case XML_CHAR_ENCODING_UTF16BE:
1499 return("UTF-16");
1500 case XML_CHAR_ENCODING_EBCDIC:
1501 return("EBCDIC");
1502 case XML_CHAR_ENCODING_UCS4LE:
1503 return("ISO-10646-UCS-4");
1504 case XML_CHAR_ENCODING_UCS4BE:
1505 return("ISO-10646-UCS-4");
1506 case XML_CHAR_ENCODING_UCS4_2143:
1507 return("ISO-10646-UCS-4");
1508 case XML_CHAR_ENCODING_UCS4_3412:
1509 return("ISO-10646-UCS-4");
1510 case XML_CHAR_ENCODING_UCS2:
1511 return("ISO-10646-UCS-2");
1512 case XML_CHAR_ENCODING_8859_1:
1513 return("ISO-8859-1");
1514 case XML_CHAR_ENCODING_8859_2:
1515 return("ISO-8859-2");
1516 case XML_CHAR_ENCODING_8859_3:
1517 return("ISO-8859-3");
1518 case XML_CHAR_ENCODING_8859_4:
1519 return("ISO-8859-4");
1520 case XML_CHAR_ENCODING_8859_5:
1521 return("ISO-8859-5");
1522 case XML_CHAR_ENCODING_8859_6:
1523 return("ISO-8859-6");
1524 case XML_CHAR_ENCODING_8859_7:
1525 return("ISO-8859-7");
1526 case XML_CHAR_ENCODING_8859_8:
1527 return("ISO-8859-8");
1528 case XML_CHAR_ENCODING_8859_9:
1529 return("ISO-8859-9");
1530 case XML_CHAR_ENCODING_2022_JP:
1531 return("ISO-2022-JP");
1532 case XML_CHAR_ENCODING_SHIFT_JIS:
1533 return("Shift-JIS");
1534 case XML_CHAR_ENCODING_EUC_JP:
1535 return("EUC-JP");
1536 case XML_CHAR_ENCODING_ASCII:
1537 return(NULL);
1538 }
1539 return(NULL);
1540}
1541
Daniel Veillard97ac1312001-05-30 19:14:17 +00001542/************************************************************************
1543 * *
1544 * Char encoding handlers *
1545 * *
1546 ************************************************************************/
1547
Owen Taylor3473f882001-02-23 17:55:21 +00001548
1549/* the size should be growable, but it's not a big deal ... */
1550#define MAX_ENCODING_HANDLERS 50
1551static xmlCharEncodingHandlerPtr *handlers = NULL;
1552static int nbCharEncodingHandler = 0;
1553
1554/*
1555 * The default is UTF-8 for XML, that's also the default used for the
1556 * parser internals, so the default encoding handler is NULL
1557 */
1558
1559static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1560
1561/**
1562 * xmlNewCharEncodingHandler:
1563 * @name: the encoding name, in UTF-8 format (ASCII actually)
1564 * @input: the xmlCharEncodingInputFunc to read that encoding
1565 * @output: the xmlCharEncodingOutputFunc to write that encoding
1566 *
1567 * Create and registers an xmlCharEncodingHandler.
Daniel Veillard6f46f6c2002-08-01 12:22:24 +00001568 *
Owen Taylor3473f882001-02-23 17:55:21 +00001569 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1570 */
Daniel Veillard6f46f6c2002-08-01 12:22:24 +00001571xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001572xmlNewCharEncodingHandler(const char *name,
1573 xmlCharEncodingInputFunc input,
1574 xmlCharEncodingOutputFunc output) {
1575 xmlCharEncodingHandlerPtr handler;
1576 const char *alias;
1577 char upper[500];
1578 int i;
1579 char *up = 0;
1580
1581 /*
1582 * Do the alias resolution
1583 */
1584 alias = xmlGetEncodingAlias(name);
1585 if (alias != NULL)
1586 name = alias;
1587
1588 /*
1589 * Keep only the uppercase version of the encoding.
1590 */
1591 if (name == NULL) {
1592 xmlGenericError(xmlGenericErrorContext,
1593 "xmlNewCharEncodingHandler : no name !\n");
1594 return(NULL);
1595 }
1596 for (i = 0;i < 499;i++) {
1597 upper[i] = toupper(name[i]);
1598 if (upper[i] == 0) break;
1599 }
1600 upper[i] = 0;
1601 up = xmlMemStrdup(upper);
1602 if (up == NULL) {
1603 xmlGenericError(xmlGenericErrorContext,
1604 "xmlNewCharEncodingHandler : out of memory !\n");
1605 return(NULL);
1606 }
1607
1608 /*
1609 * allocate and fill-up an handler block.
1610 */
1611 handler = (xmlCharEncodingHandlerPtr)
1612 xmlMalloc(sizeof(xmlCharEncodingHandler));
1613 if (handler == NULL) {
1614 xmlGenericError(xmlGenericErrorContext,
1615 "xmlNewCharEncodingHandler : out of memory !\n");
1616 return(NULL);
1617 }
1618 handler->input = input;
1619 handler->output = output;
1620 handler->name = up;
1621
1622#ifdef LIBXML_ICONV_ENABLED
1623 handler->iconv_in = NULL;
1624 handler->iconv_out = NULL;
1625#endif /* LIBXML_ICONV_ENABLED */
1626
1627 /*
1628 * registers and returns the handler.
1629 */
1630 xmlRegisterCharEncodingHandler(handler);
1631#ifdef DEBUG_ENCODING
1632 xmlGenericError(xmlGenericErrorContext,
1633 "Registered encoding handler for %s\n", name);
1634#endif
1635 return(handler);
1636}
1637
1638/**
1639 * xmlInitCharEncodingHandlers:
1640 *
1641 * Initialize the char encoding support, it registers the default
1642 * encoding supported.
1643 * NOTE: while public, this function usually doesn't need to be called
1644 * in normal processing.
1645 */
1646void
1647xmlInitCharEncodingHandlers(void) {
1648 unsigned short int tst = 0x1234;
1649 unsigned char *ptr = (unsigned char *) &tst;
1650
1651 if (handlers != NULL) return;
1652
1653 handlers = (xmlCharEncodingHandlerPtr *)
1654 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1655
1656 if (*ptr == 0x12) xmlLittleEndian = 0;
1657 else if (*ptr == 0x34) xmlLittleEndian = 1;
1658 else xmlGenericError(xmlGenericErrorContext,
1659 "Odd problem at endianness detection\n");
1660
1661 if (handlers == NULL) {
1662 xmlGenericError(xmlGenericErrorContext,
1663 "xmlInitCharEncodingHandlers : out of memory !\n");
1664 return;
1665 }
Daniel Veillard81601f92003-01-14 13:42:37 +00001666 xmlNewCharEncodingHandler("UTF-8", UTF8ToUTF8, UTF8ToUTF8);
Owen Taylor3473f882001-02-23 17:55:21 +00001667 xmlUTF16LEHandler =
1668 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1669 xmlUTF16BEHandler =
1670 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1671 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1672 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard20042422001-05-31 18:22:04 +00001673 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor3473f882001-02-23 17:55:21 +00001674#ifdef LIBXML_HTML_ENABLED
1675 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1676#endif
1677}
1678
1679/**
1680 * xmlCleanupCharEncodingHandlers:
1681 *
1682 * Cleanup the memory allocated for the char encoding support, it
1683 * unregisters all the encoding handlers and the aliases.
1684 */
1685void
1686xmlCleanupCharEncodingHandlers(void) {
1687 xmlCleanupEncodingAliases();
1688
1689 if (handlers == NULL) return;
1690
1691 for (;nbCharEncodingHandler > 0;) {
1692 nbCharEncodingHandler--;
1693 if (handlers[nbCharEncodingHandler] != NULL) {
1694 if (handlers[nbCharEncodingHandler]->name != NULL)
1695 xmlFree(handlers[nbCharEncodingHandler]->name);
1696 xmlFree(handlers[nbCharEncodingHandler]);
1697 }
1698 }
1699 xmlFree(handlers);
1700 handlers = NULL;
1701 nbCharEncodingHandler = 0;
1702 xmlDefaultCharEncodingHandler = NULL;
1703}
1704
1705/**
1706 * xmlRegisterCharEncodingHandler:
1707 * @handler: the xmlCharEncodingHandlerPtr handler block
1708 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001709 * Register the char encoding handler, surprising, isn't it ?
Owen Taylor3473f882001-02-23 17:55:21 +00001710 */
1711void
1712xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1713 if (handlers == NULL) xmlInitCharEncodingHandlers();
1714 if (handler == NULL) {
1715 xmlGenericError(xmlGenericErrorContext,
1716 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1717 return;
1718 }
1719
1720 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1721 xmlGenericError(xmlGenericErrorContext,
1722 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1723 xmlGenericError(xmlGenericErrorContext,
1724 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1725 return;
1726 }
1727 handlers[nbCharEncodingHandler++] = handler;
1728}
1729
1730/**
1731 * xmlGetCharEncodingHandler:
1732 * @enc: an xmlCharEncoding value.
1733 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001734 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001735 *
1736 * Returns the handler or NULL if not found
1737 */
1738xmlCharEncodingHandlerPtr
1739xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1740 xmlCharEncodingHandlerPtr handler;
1741
1742 if (handlers == NULL) xmlInitCharEncodingHandlers();
1743 switch (enc) {
1744 case XML_CHAR_ENCODING_ERROR:
1745 return(NULL);
1746 case XML_CHAR_ENCODING_NONE:
1747 return(NULL);
1748 case XML_CHAR_ENCODING_UTF8:
1749 return(NULL);
1750 case XML_CHAR_ENCODING_UTF16LE:
1751 return(xmlUTF16LEHandler);
1752 case XML_CHAR_ENCODING_UTF16BE:
1753 return(xmlUTF16BEHandler);
1754 case XML_CHAR_ENCODING_EBCDIC:
1755 handler = xmlFindCharEncodingHandler("EBCDIC");
1756 if (handler != NULL) return(handler);
1757 handler = xmlFindCharEncodingHandler("ebcdic");
1758 if (handler != NULL) return(handler);
1759 break;
1760 case XML_CHAR_ENCODING_UCS4BE:
1761 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1762 if (handler != NULL) return(handler);
1763 handler = xmlFindCharEncodingHandler("UCS-4");
1764 if (handler != NULL) return(handler);
1765 handler = xmlFindCharEncodingHandler("UCS4");
1766 if (handler != NULL) return(handler);
1767 break;
1768 case XML_CHAR_ENCODING_UCS4LE:
1769 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1770 if (handler != NULL) return(handler);
1771 handler = xmlFindCharEncodingHandler("UCS-4");
1772 if (handler != NULL) return(handler);
1773 handler = xmlFindCharEncodingHandler("UCS4");
1774 if (handler != NULL) return(handler);
1775 break;
1776 case XML_CHAR_ENCODING_UCS4_2143:
1777 break;
1778 case XML_CHAR_ENCODING_UCS4_3412:
1779 break;
1780 case XML_CHAR_ENCODING_UCS2:
1781 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1782 if (handler != NULL) return(handler);
1783 handler = xmlFindCharEncodingHandler("UCS-2");
1784 if (handler != NULL) return(handler);
1785 handler = xmlFindCharEncodingHandler("UCS2");
1786 if (handler != NULL) return(handler);
1787 break;
1788
1789 /*
1790 * We used to keep ISO Latin encodings native in the
1791 * generated data. This led to so many problems that
1792 * this has been removed. One can still change this
1793 * back by registering no-ops encoders for those
1794 */
1795 case XML_CHAR_ENCODING_8859_1:
1796 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1797 if (handler != NULL) return(handler);
1798 break;
1799 case XML_CHAR_ENCODING_8859_2:
1800 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1801 if (handler != NULL) return(handler);
1802 break;
1803 case XML_CHAR_ENCODING_8859_3:
1804 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1805 if (handler != NULL) return(handler);
1806 break;
1807 case XML_CHAR_ENCODING_8859_4:
1808 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1809 if (handler != NULL) return(handler);
1810 break;
1811 case XML_CHAR_ENCODING_8859_5:
1812 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1813 if (handler != NULL) return(handler);
1814 break;
1815 case XML_CHAR_ENCODING_8859_6:
1816 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1817 if (handler != NULL) return(handler);
1818 break;
1819 case XML_CHAR_ENCODING_8859_7:
1820 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1821 if (handler != NULL) return(handler);
1822 break;
1823 case XML_CHAR_ENCODING_8859_8:
1824 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1825 if (handler != NULL) return(handler);
1826 break;
1827 case XML_CHAR_ENCODING_8859_9:
1828 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1829 if (handler != NULL) return(handler);
1830 break;
1831
1832
1833 case XML_CHAR_ENCODING_2022_JP:
1834 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1835 if (handler != NULL) return(handler);
1836 break;
1837 case XML_CHAR_ENCODING_SHIFT_JIS:
1838 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1839 if (handler != NULL) return(handler);
1840 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1841 if (handler != NULL) return(handler);
1842 handler = xmlFindCharEncodingHandler("Shift_JIS");
1843 if (handler != NULL) return(handler);
1844 break;
1845 case XML_CHAR_ENCODING_EUC_JP:
1846 handler = xmlFindCharEncodingHandler("EUC-JP");
1847 if (handler != NULL) return(handler);
1848 break;
1849 default:
1850 break;
1851 }
1852
1853#ifdef DEBUG_ENCODING
1854 xmlGenericError(xmlGenericErrorContext,
1855 "No handler found for encoding %d\n", enc);
1856#endif
1857 return(NULL);
1858}
1859
1860/**
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001861 * xmlFindCharEncodingHandler:
1862 * @name: a string describing the char encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001863 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001864 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001865 *
1866 * Returns the handler or NULL if not found
1867 */
1868xmlCharEncodingHandlerPtr
1869xmlFindCharEncodingHandler(const char *name) {
1870 const char *nalias;
1871 const char *norig;
1872 xmlCharEncoding alias;
1873#ifdef LIBXML_ICONV_ENABLED
1874 xmlCharEncodingHandlerPtr enc;
1875 iconv_t icv_in, icv_out;
1876#endif /* LIBXML_ICONV_ENABLED */
1877 char upper[100];
1878 int i;
1879
1880 if (handlers == NULL) xmlInitCharEncodingHandlers();
1881 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1882 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1883
1884 /*
1885 * Do the alias resolution
1886 */
1887 norig = name;
1888 nalias = xmlGetEncodingAlias(name);
1889 if (nalias != NULL)
1890 name = nalias;
1891
1892 /*
1893 * Check first for directly registered encoding names
1894 */
1895 for (i = 0;i < 99;i++) {
1896 upper[i] = toupper(name[i]);
1897 if (upper[i] == 0) break;
1898 }
1899 upper[i] = 0;
1900
1901 for (i = 0;i < nbCharEncodingHandler; i++)
1902 if (!strcmp(upper, handlers[i]->name)) {
1903#ifdef DEBUG_ENCODING
1904 xmlGenericError(xmlGenericErrorContext,
1905 "Found registered handler for encoding %s\n", name);
1906#endif
1907 return(handlers[i]);
1908 }
1909
1910#ifdef LIBXML_ICONV_ENABLED
1911 /* check whether iconv can handle this */
1912 icv_in = iconv_open("UTF-8", name);
1913 icv_out = iconv_open(name, "UTF-8");
1914 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1915 enc = (xmlCharEncodingHandlerPtr)
1916 xmlMalloc(sizeof(xmlCharEncodingHandler));
1917 if (enc == NULL) {
1918 iconv_close(icv_in);
1919 iconv_close(icv_out);
1920 return(NULL);
1921 }
1922 enc->name = xmlMemStrdup(name);
1923 enc->input = NULL;
1924 enc->output = NULL;
1925 enc->iconv_in = icv_in;
1926 enc->iconv_out = icv_out;
1927#ifdef DEBUG_ENCODING
1928 xmlGenericError(xmlGenericErrorContext,
1929 "Found iconv handler for encoding %s\n", name);
1930#endif
1931 return enc;
1932 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1933 xmlGenericError(xmlGenericErrorContext,
1934 "iconv : problems with filters for '%s'\n", name);
1935 }
1936#endif /* LIBXML_ICONV_ENABLED */
1937
1938#ifdef DEBUG_ENCODING
1939 xmlGenericError(xmlGenericErrorContext,
1940 "No handler found for encoding %s\n", name);
1941#endif
1942
1943 /*
1944 * Fallback using the canonical names
1945 */
1946 alias = xmlParseCharEncoding(norig);
1947 if (alias != XML_CHAR_ENCODING_ERROR) {
1948 const char* canon;
1949 canon = xmlGetCharEncodingName(alias);
1950 if ((canon != NULL) && (strcmp(name, canon))) {
1951 return(xmlFindCharEncodingHandler(canon));
1952 }
1953 }
1954
1955 return(NULL);
1956}
1957
Daniel Veillard97ac1312001-05-30 19:14:17 +00001958/************************************************************************
1959 * *
1960 * ICONV based generic conversion functions *
1961 * *
1962 ************************************************************************/
1963
Owen Taylor3473f882001-02-23 17:55:21 +00001964#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes in the source encoding of @cd,
 *       or NULL
 * @inlen:  the length of @in
 *
 * Run one iconv() conversion step and translate its outcome into the
 * return codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (for *in is not valid utf8 string or
 *        the result of transformation can't fit into the encoding we want), or
 *     -3 if there the last byte can't form a single output char.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;     /* iconv() reports failure as (size_t) -1 */

    ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
    if (in != NULL) {
        /* iconv() leaves the bytes still to read/write in the counters */
        *inlen -= (int) icv_inlen;
        *outlen -= (int) icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }
    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (errno == EILSEQ) {
            return -2;
        } else
#endif
#ifdef E2BIG
        if (errno == E2BIG) {
            return -1;
        } else
#endif
#ifdef EINVAL
        if (errno == EINVAL) {
            return -3;
        } else
#endif
        {
            return -3;
        }
    }
    return 0;
}
2023#endif /* LIBXML_ICONV_ENABLED */
2024
Daniel Veillard97ac1312001-05-30 19:14:17 +00002025/************************************************************************
2026 * *
2027 * The real API used by libxml for on-the-fly conversion *
2028 * *
2029 ************************************************************************/
2030
Owen Taylor3473f882001-02-23 17:55:21 +00002031/**
2032 * xmlCharEncFirstLine:
2033 * @handler: char enconding transformation data structure
2034 * @out: an xmlBuffer for the output.
2035 * @in: an xmlBuffer for the input
2036 *
2037 * Front-end for the encoding handler input function, but handle only
2038 * the very first line, i.e. limit itself to 45 chars.
2039 *
2040 * Returns the number of byte written if success, or
2041 * -1 general error
2042 * -2 if the transcoding fails (for *in is not valid utf8 string or
2043 * the result of transformation can't fit into the encoding we want), or
2044 */
2045int
2046xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2047 xmlBufferPtr in) {
2048 int ret = -2;
2049 int written;
2050 int toconv;
2051
2052 if (handler == NULL) return(-1);
2053 if (out == NULL) return(-1);
2054 if (in == NULL) return(-1);
2055
2056 written = out->size - out->use;
2057 toconv = in->use;
2058 if (toconv * 2 >= written) {
2059 xmlBufferGrow(out, toconv);
2060 written = out->size - out->use - 1;
2061 }
2062
2063 /*
2064 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
2065 * 45 chars should be sufficient to reach the end of the encoding
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002066 * declaration without going too far inside the document content.
Owen Taylor3473f882001-02-23 17:55:21 +00002067 */
2068 written = 45;
2069
2070 if (handler->input != NULL) {
2071 ret = handler->input(&out->content[out->use], &written,
2072 in->content, &toconv);
2073 xmlBufferShrink(in, toconv);
2074 out->use += written;
2075 out->content[out->use] = 0;
2076 }
2077#ifdef LIBXML_ICONV_ENABLED
2078 else if (handler->iconv_in != NULL) {
2079 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2080 &written, in->content, &toconv);
2081 xmlBufferShrink(in, toconv);
2082 out->use += written;
2083 out->content[out->use] = 0;
2084 if (ret == -1) ret = -3;
2085 }
2086#endif /* LIBXML_ICONV_ENABLED */
2087#ifdef DEBUG_ENCODING
2088 switch (ret) {
2089 case 0:
2090 xmlGenericError(xmlGenericErrorContext,
2091 "converted %d bytes to %d bytes of input\n",
2092 toconv, written);
2093 break;
2094 case -1:
2095 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2096 toconv, written, in->use);
2097 break;
2098 case -2:
2099 xmlGenericError(xmlGenericErrorContext,
2100 "input conversion failed due to input error\n");
2101 break;
2102 case -3:
2103 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2104 toconv, written, in->use);
2105 break;
2106 default:
2107 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2108 }
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002109#endif /* DEBUG_ENCODING */
Owen Taylor3473f882001-02-23 17:55:21 +00002110 /*
2111 * Ignore when input buffer is not on a boundary
2112 */
2113 if (ret == -3) ret = 0;
2114 if (ret == -1) ret = 0;
2115 return(ret);
2116}
2117
2118/**
2119 * xmlCharEncInFunc:
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002120 * @handler: char encoding transformation data structure
Owen Taylor3473f882001-02-23 17:55:21 +00002121 * @out: an xmlBuffer for the output.
2122 * @in: an xmlBuffer for the input
2123 *
2124 * Generic front-end for the encoding handler input function
2125 *
2126 * Returns the number of byte written if success, or
2127 * -1 general error
2128 * -2 if the transcoding fails (for *in is not valid utf8 string or
2129 * the result of transformation can't fit into the encoding we want), or
2130 */
2131int
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002132xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2133 xmlBufferPtr in)
2134{
Owen Taylor3473f882001-02-23 17:55:21 +00002135 int ret = -2;
2136 int written;
2137 int toconv;
2138
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002139 if (handler == NULL)
2140 return (-1);
2141 if (out == NULL)
2142 return (-1);
2143 if (in == NULL)
2144 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00002145
2146 toconv = in->use;
2147 if (toconv == 0)
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002148 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00002149 written = out->size - out->use;
2150 if (toconv * 2 >= written) {
2151 xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002152 written = out->size - out->use - 1;
Owen Taylor3473f882001-02-23 17:55:21 +00002153 }
2154 if (handler->input != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002155 ret = handler->input(&out->content[out->use], &written,
2156 in->content, &toconv);
2157 xmlBufferShrink(in, toconv);
2158 out->use += written;
2159 out->content[out->use] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002160 }
2161#ifdef LIBXML_ICONV_ENABLED
2162 else if (handler->iconv_in != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002163 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2164 &written, in->content, &toconv);
2165 xmlBufferShrink(in, toconv);
2166 out->use += written;
2167 out->content[out->use] = 0;
2168 if (ret == -1)
2169 ret = -3;
Owen Taylor3473f882001-02-23 17:55:21 +00002170 }
2171#endif /* LIBXML_ICONV_ENABLED */
2172 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002173 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002174#ifdef DEBUG_ENCODING
2175 xmlGenericError(xmlGenericErrorContext,
2176 "converted %d bytes to %d bytes of input\n",
2177 toconv, written);
Owen Taylor3473f882001-02-23 17:55:21 +00002178#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002179 break;
2180 case -1:
2181#ifdef DEBUG_ENCODING
2182 xmlGenericError(xmlGenericErrorContext,
2183 "converted %d bytes to %d bytes of input, %d left\n",
2184 toconv, written, in->use);
2185#endif
2186 break;
2187 case -3:
2188#ifdef DEBUG_ENCODING
2189 xmlGenericError(xmlGenericErrorContext,
2190 "converted %d bytes to %d bytes of input, %d left\n",
2191 toconv, written, in->use);
2192#endif
2193 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002194 case -2:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002195 xmlGenericError(xmlGenericErrorContext,
2196 "input conversion failed due to input error\n");
2197 xmlGenericError(xmlGenericErrorContext,
2198 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2199 in->content[0], in->content[1],
2200 in->content[2], in->content[3]);
Owen Taylor3473f882001-02-23 17:55:21 +00002201 }
2202 /*
2203 * Ignore when input buffer is not on a boundary
2204 */
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002205 if (ret == -3)
2206 ret = 0;
Daniel Veillardd076a202002-11-20 13:28:31 +00002207 return (written);
Owen Taylor3473f882001-02-23 17:55:21 +00002208}
2209
2210/**
2211 * xmlCharEncOutFunc:
2212 * @handler: char enconding transformation data structure
2213 * @out: an xmlBuffer for the output.
2214 * @in: an xmlBuffer for the input
2215 *
2216 * Generic front-end for the encoding handler output function
2217 * a first call with @in == NULL has to be made firs to initiate the
2218 * output in case of non-stateless encoding needing to initiate their
2219 * state or the output (like the BOM in UTF16).
2220 * In case of UTF8 sequence conversion errors for the given encoder,
2221 * the content will be automatically remapped to a CharRef sequence.
2222 *
2223 * Returns the number of byte written if success, or
2224 * -1 general error
2225 * -2 if the transcoding fails (for *in is not valid utf8 string or
2226 * the result of transformation can't fit into the encoding we want), or
2227 */
2228int
2229xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2230 xmlBufferPtr in) {
2231 int ret = -2;
2232 int written;
2233 int writtentot = 0;
2234 int toconv;
2235 int output = 0;
2236
2237 if (handler == NULL) return(-1);
2238 if (out == NULL) return(-1);
2239
2240retry:
2241
2242 written = out->size - out->use;
2243
Igor Zlatkovic73267db2003-03-08 13:29:24 +00002244 if (written > 0)
2245 written--; /* Gennady: count '/0' */
2246
Owen Taylor3473f882001-02-23 17:55:21 +00002247 /*
2248 * First specific handling of in = NULL, i.e. the initialization call
2249 */
2250 if (in == NULL) {
2251 toconv = 0;
2252 if (handler->output != NULL) {
2253 ret = handler->output(&out->content[out->use], &written,
2254 NULL, &toconv);
Daniel Veillard8caa9c22003-06-02 13:35:24 +00002255 if (ret >= 0) { /* Gennady: check return value */
Igor Zlatkovic73267db2003-03-08 13:29:24 +00002256 out->use += written;
2257 out->content[out->use] = 0;
2258 }
Owen Taylor3473f882001-02-23 17:55:21 +00002259 }
2260#ifdef LIBXML_ICONV_ENABLED
2261 else if (handler->iconv_out != NULL) {
2262 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2263 &written, NULL, &toconv);
2264 out->use += written;
2265 out->content[out->use] = 0;
2266 }
2267#endif /* LIBXML_ICONV_ENABLED */
2268#ifdef DEBUG_ENCODING
2269 xmlGenericError(xmlGenericErrorContext,
2270 "initialized encoder\n");
2271#endif
2272 return(0);
2273 }
2274
2275 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002276 * Conversion itself.
Owen Taylor3473f882001-02-23 17:55:21 +00002277 */
2278 toconv = in->use;
2279 if (toconv == 0)
2280 return(0);
2281 if (toconv * 2 >= written) {
2282 xmlBufferGrow(out, toconv * 2);
2283 written = out->size - out->use - 1;
2284 }
2285 if (handler->output != NULL) {
2286 ret = handler->output(&out->content[out->use], &written,
2287 in->content, &toconv);
2288 xmlBufferShrink(in, toconv);
2289 out->use += written;
2290 writtentot += written;
2291 out->content[out->use] = 0;
2292 }
2293#ifdef LIBXML_ICONV_ENABLED
2294 else if (handler->iconv_out != NULL) {
2295 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2296 &written, in->content, &toconv);
2297 xmlBufferShrink(in, toconv);
2298 out->use += written;
2299 writtentot += written;
2300 out->content[out->use] = 0;
2301 if (ret == -1) {
2302 if (written > 0) {
2303 /*
2304 * Can be a limitation of iconv
2305 */
2306 goto retry;
2307 }
2308 ret = -3;
2309 }
2310 }
2311#endif /* LIBXML_ICONV_ENABLED */
2312 else {
2313 xmlGenericError(xmlGenericErrorContext,
2314 "xmlCharEncOutFunc: no output function !\n");
2315 return(-1);
2316 }
2317
2318 if (ret >= 0) output += ret;
2319
2320 /*
2321 * Attempt to handle error cases
2322 */
2323 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002324 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002325#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002326 xmlGenericError(xmlGenericErrorContext,
2327 "converted %d bytes to %d bytes of output\n",
2328 toconv, written);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002329#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002330 break;
2331 case -1:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002332#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002333 xmlGenericError(xmlGenericErrorContext,
2334 "output conversion failed by lack of space\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002335#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002336 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002337 case -3:
Daniel Veillard809faa52003-02-10 15:43:53 +00002338#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002339 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2340 toconv, written, in->use);
Daniel Veillard809faa52003-02-10 15:43:53 +00002341#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002342 break;
2343 case -2: {
2344 int len = in->use;
2345 const xmlChar *utf = (const xmlChar *) in->content;
2346 int cur;
2347
2348 cur = xmlGetUTF8Char(utf, &len);
2349 if (cur > 0) {
2350 xmlChar charref[20];
2351
2352#ifdef DEBUG_ENCODING
2353 xmlGenericError(xmlGenericErrorContext,
2354 "handling output conversion error\n");
2355 xmlGenericError(xmlGenericErrorContext,
2356 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2357 in->content[0], in->content[1],
2358 in->content[2], in->content[3]);
2359#endif
2360 /*
2361 * Removes the UTF8 sequence, and replace it by a charref
2362 * and continue the transcoding phase, hoping the error
2363 * did not mangle the encoder state.
2364 */
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002365 snprintf((char *) charref, sizeof(charref), "&#%d;", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002366 xmlBufferShrink(in, len);
2367 xmlBufferAddHead(in, charref, -1);
2368
2369 goto retry;
2370 } else {
2371 xmlGenericError(xmlGenericErrorContext,
2372 "output conversion failed due to conv error\n");
2373 xmlGenericError(xmlGenericErrorContext,
2374 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2375 in->content[0], in->content[1],
2376 in->content[2], in->content[3]);
2377 in->content[0] = ' ';
2378 }
2379 break;
2380 }
2381 }
2382 return(ret);
2383}
2384
2385/**
2386 * xmlCharEncCloseFunc:
2387 * @handler: char enconding transformation data structure
2388 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002389 * Generic front-end for encoding handler close function
Owen Taylor3473f882001-02-23 17:55:21 +00002390 *
2391 * Returns 0 if success, or -1 in case of error
2392 */
2393int
2394xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2395 int ret = 0;
2396 if (handler == NULL) return(-1);
2397 if (handler->name == NULL) return(-1);
2398#ifdef LIBXML_ICONV_ENABLED
2399 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002400 * Iconv handlers can be used only once, free the whole block.
Owen Taylor3473f882001-02-23 17:55:21 +00002401 * and the associated icon resources.
2402 */
2403 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2404 if (handler->name != NULL)
2405 xmlFree(handler->name);
2406 handler->name = NULL;
2407 if (handler->iconv_out != NULL) {
2408 if (iconv_close(handler->iconv_out))
2409 ret = -1;
2410 handler->iconv_out = NULL;
2411 }
2412 if (handler->iconv_in != NULL) {
2413 if (iconv_close(handler->iconv_in))
2414 ret = -1;
2415 handler->iconv_in = NULL;
2416 }
2417 xmlFree(handler);
2418 }
2419#endif /* LIBXML_ICONV_ENABLED */
2420#ifdef DEBUG_ENCODING
2421 if (ret)
2422 xmlGenericError(xmlGenericErrorContext,
2423 "failed to close the encoding handler\n");
2424 else
2425 xmlGenericError(xmlGenericErrorContext,
2426 "closed the encoding handler\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002427#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002428
Owen Taylor3473f882001-02-23 17:55:21 +00002429 return(ret);
2430}
2431