blob: fab241e6d2fa98a1b08357c20bd39279bcf0b0a0 [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 character codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
22
23#ifdef WIN32
24#include "win32config.h"
25#else
26#include "config.h"
27#endif
28
29#include <stdio.h>
30#include <string.h>
31
32#ifdef HAVE_CTYPE_H
33#include <ctype.h>
34#endif
35#ifdef HAVE_STDLIB_H
36#include <stdlib.h>
37#endif
38#include <libxml/xmlversion.h>
39#ifdef LIBXML_ICONV_ENABLED
40#ifdef HAVE_ERRNO_H
41#include <errno.h>
42#endif
43#endif
44#include <libxml/encoding.h>
45#include <libxml/xmlmemory.h>
46#ifdef LIBXML_HTML_ENABLED
47#include <libxml/HTMLparser.h>
48#endif
49#include <libxml/xmlerror.h>
50
51xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
52xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
53
54typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
55typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
56struct _xmlCharEncodingAlias {
57 const char *name;
58 const char *alias;
59};
60
61static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
62static int xmlCharEncodingAliasesNb = 0;
63static int xmlCharEncodingAliasesMax = 0;
64
65#ifdef LIBXML_ICONV_ENABLED
66#if 0
67#define DEBUG_ENCODING /* Define this to get encoding traces */
68#endif
69#endif
70
71static int xmlLittleEndian = 1;
72
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 */
83
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  on input, the number of bytes available at @utf; on output,
 *        the number of bytes used by the decoded character
 *
 * Read one UTF-8 encoded character from @utf.  Sequences of 1 to 4
 * bytes are accepted; overlong forms are not rejected.
 *
 * Returns the character value, or -1 in case of error (NULL argument,
 * not enough bytes, stray or missing continuation byte, lead byte of a
 * sequence longer than 4 bytes).  On error *@len is set to 0 when @len
 * is not NULL.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* a continuation byte (10xxxxxx) can never start a sequence */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* 11111xxx would announce a 5+ byte sequence */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* @len itself may be the reason we got here: do not dereference NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
150
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8.  @utf is assumed to be
 * null-terminated.  This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary.  Note that Java is
 * capable of producing these sequences if provoked.  Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid, false otherwise (including when
 * @utf is NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /*
             * 3-byte code; each byte is checked before the next one is
             * read so the terminating 0 is never stepped over
             */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            if ((utf[ix + 2] & 0xc0) != 0x80)
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            if ((utf[ix + 2] & 0xc0) != 0x80)
                return(0);
            if ((utf[ix + 3] & 0xc0) != 0x80)
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte, or lead of a 5+ byte sequence */
            return(0);
        }
    }
    return(1);
}
194
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Copy a block of ASCII chars from @in to @out; ASCII is already valid
 * UTF-8 so each byte is passed through unchanged.  Conversion stops at
 * the first byte >= 0x80, at the end of the input, or once fewer than
 * 6 bytes of @out remain unused.
 *
 * Returns 0 if success, or -1 if a non-ASCII byte was met.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *dst = out;
    const unsigned char *src = in;
    const unsigned char *srcEnd = in + (*inlen);
    int status = 0;

    /* keep the same 5 byte headroom as the other xxxToUTF8 converters */
    while ((src < srcEnd) && ((dst - out) + 5 < *outlen)) {
        unsigned int ch = *src;

        if (ch >= 0x80) {
            /* not ASCII: report the error without consuming the byte */
            status = -1;
            break;
        }
        *dst++ = ch;
        src++;
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(status);
}
245
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.  A multi-byte sequence cut short by the end of
 * @in is left unconsumed so it can be completed by a later call.
 * Calling with @in == NULL is an initialization request and only
 * resets both lengths to 0.
 *
 * Returns 0 if success, or -2 if the transcoding fails (malformed
 * UTF-8 or a character with no ASCII equivalent).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else {
            /* no chance for this in Ascii */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        }

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /*
                 * not a continuation byte: fail like UTF8Toisolat1
                 * instead of emitting a partially decoded value
                 */
                *outlen = out - outstart;
                *inlen = processed - instart;
                return(-2);
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* no chance for this in Ascii */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        }
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);
}
328
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Convert a block of ISO Latin 1 chars to UTF-8.  Bytes below 0x80 are
 * copied as is, the others become a 2-byte sequence.  Conversion stops
 * at the end of the input or once fewer than 6 bytes of @out remain
 * unused.
 *
 * Returns 0.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *dst = out;
    const unsigned char *src = in;
    const unsigned char *srcEnd = in + (*inlen);

    /* the 5 byte headroom guarantees room for a whole sequence */
    while ((src < srcEnd) && ((dst - out) + 5 < *outlen)) {
        unsigned int ch = *src++;

        if (ch < 0x80) {
            *dst++ = ch;
        } else {
            /* 110xxxxx 10xxxxxx */
            *dst++ = ((ch >> 6) & 0x1F) | 0xC0;
            *dst++ = (ch & 0x3F) | 0x80;
        }
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(0);
}
375
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Convert a block of UTF-8 chars to ISO Latin 1.  A sequence cut short
 * by the end of @in, or a character for which @out has no room left,
 * is left unconsumed.  Calling with @in == NULL is an initialization
 * request and only resets both lengths to 0.
 *
 * Returns 0 if success, or -2 if the transcoding fails (malformed
 * UTF-8 or a character above 0xFF).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *src = in;      /* read cursor */
    const unsigned char *done = in;     /* end of the last char fully converted */
    const unsigned char *srcEnd;
    unsigned char *dst = out;
    const unsigned char *dstEnd;
    unsigned int value, lead, cont;
    int extra;
    int status = 0;

    if (in == NULL) {
        /* initialization: nothing to do */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    srcEnd = in + (*inlen);
    dstEnd = out + (*outlen);

    while (src < srcEnd) {
        lead = *src++;
        if (lead < 0x80) {
            value = lead;
            extra = 0;
        } else if ((lead < 0xC0) || (lead >= 0xF8)) {
            /* trailing byte in leading position, or 5+ byte sequence */
            status = -2;
            goto finish;
        } else if (lead < 0xE0) {
            value = lead & 0x1F;
            extra = 1;
        } else if (lead < 0xF0) {
            value = lead & 0x0F;
            extra = 2;
        } else {
            value = lead & 0x07;
            extra = 3;
        }

        /* sequence not complete yet: leave it for the next call */
        if (srcEnd - src < extra)
            goto finish;

        for ( ; extra > 0; extra--) {
            cont = *src++;
            if ((cont & 0xC0) != 0x80) {
                status = -2;
                goto finish;
            }
            value = (value << 6) | (cont & 0x3F);
        }

        if (value > 0xFF) {
            /* no chance for this in IsoLat1 */
            status = -2;
            goto finish;
        }
        if (dst >= dstEnd)
            goto finish;
        *dst++ = value;
        done = src;
    }

finish:
    *outlen = dst - out;
    *inlen = done - in;
    return(status);
}
463
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is decoded one byte at a time,
 * so @inb needs no particular alignment and the result does not depend
 * on the byte order of the host.  An odd trailing byte is ignored, and
 * a high surrogate whose low half has not arrived yet is left
 * unconsumed.  Conversion stops once fewer than 6 bytes of @out remain
 * unused.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate not followed by a low surrogate).
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    /* only whole 16-bit units are converted */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    /* the 5 byte headroom guarantees room for a whole 4-byte sequence */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* low surrogate not available yet */
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
551
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.  The output is stored one byte at a time, so
 * @outb needs no particular alignment and the result does not depend
 * on the byte order of the host.  Calling with @in == NULL is an
 * initialization request: the FF FE Byte Order Mark is written when
 * there is room for it.  A sequence cut short by the end of @in, or a
 * character for which @outb has no room left, is left unconsumed.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written, or -2
 * if the transcoding fails (malformed UTF-8).
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            *outlen = out - outb;
            *inlen = processed - instart;
            return(-2);
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else {
            /* no chance for this in UTF-16 */
            *outlen = out - outb;
            *inlen = processed - instart;
            return(-2);
        }

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte */
                *outlen = out - outb;
                *inlen = processed - instart;
                return(-2);
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = c & 0xFF;
            *out++ = (c >> 8) & 0xFF;
        }
        else if (c < 0x110000) {
            /* surrogate pair: both halves must fit */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = hi & 0xFF;
            *out++ = (hi >> 8) & 0xFF;
            *out++ = lo & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
        }
        else
            break;    /* beyond the UTF-16 range: stop without consuming it */
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);
}
670
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is decoded one byte at a time,
 * so @inb needs no particular alignment and the result does not depend
 * on the byte order of the host.  An odd trailing byte is ignored.  A
 * high surrogate whose low half has not arrived yet, or a character
 * whose UTF-8 form does not fit in what is left of @out, is left
 * unconsumed: no partial sequence is ever written.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate not followed by a low surrogate).
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits, needed;

    /* only whole 16-bit units are converted */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* low surrogate not available yet */
                break;
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* make sure the whole sequence fits before writing any of it */
        if      (c <    0x80) needed = 1;
        else if (c <   0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else                  needed = 4;
        if (outend - out < needed)
            break;

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
762
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out.  The output is stored one byte at a time, so
 * @outb needs no particular alignment and the result does not depend
 * on the byte order of the host.  Calling with @in == NULL is an
 * initialization request: the FE FF Byte Order Mark is written when
 * there is room for it.  A sequence cut short by the end of @in, or a
 * character for which @outb has no room left, is left unconsumed.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written, or -2
 * if the transcoding fails (malformed UTF-8).
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            *outlen = out - outb;
            *inlen = processed - instart;
            return(-2);
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else {
            /* no chance for this in UTF-16 */
            *outlen = out - outb;
            *inlen = processed - instart;
            return(-2);
        }

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte */
                *outlen = out - outb;
                *inlen = processed - instart;
                return(-2);
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (c >> 8) & 0xFF;
            *out++ = c & 0xFF;
        }
        else if (c < 0x110000) {
            /* surrogate pair: both halves must fit */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (hi >> 8) & 0xFF;
            *out++ = hi & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
            *out++ = lo & 0xFF;
        }
        else
            break;    /* beyond the UTF-16 range: stop without consuming it */
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);
}
878
879/**
880 * xmlDetectCharEncoding:
881 * @in: a pointer to the first bytes of the XML entity, must be at least
882 * 4 bytes long.
883 * @len: pointer to the length of the buffer
884 *
885 * Guess the encoding of the entity using the first bytes of the entity content
886 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
887 *
888 * Returns one of the XML_CHAR_ENCODING_... values.
889 */
890xmlCharEncoding
891xmlDetectCharEncoding(const unsigned char* in, int len)
892{
893 if (len >= 4) {
894 if ((in[0] == 0x00) && (in[1] == 0x00) &&
895 (in[2] == 0x00) && (in[3] == 0x3C))
896 return(XML_CHAR_ENCODING_UCS4BE);
897 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
898 (in[2] == 0x00) && (in[3] == 0x00))
899 return(XML_CHAR_ENCODING_UCS4LE);
900 if ((in[0] == 0x00) && (in[1] == 0x00) &&
901 (in[2] == 0x3C) && (in[3] == 0x00))
902 return(XML_CHAR_ENCODING_UCS4_2143);
903 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
904 (in[2] == 0x00) && (in[3] == 0x00))
905 return(XML_CHAR_ENCODING_UCS4_3412);
906 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
907 (in[2] == 0xA7) && (in[3] == 0x94))
908 return(XML_CHAR_ENCODING_EBCDIC);
909 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
910 (in[2] == 0x78) && (in[3] == 0x6D))
911 return(XML_CHAR_ENCODING_UTF8);
912 }
913 if (len >= 2) {
914 if ((in[0] == 0xFE) && (in[1] == 0xFF))
915 return(XML_CHAR_ENCODING_UTF16BE);
916 if ((in[0] == 0xFF) && (in[1] == 0xFE))
917 return(XML_CHAR_ENCODING_UTF16LE);
918 }
919 return(XML_CHAR_ENCODING_NONE);
920}
921
922/**
923 * xmlCleanupEncodingAliases:
924 *
925 * Unregisters all aliases
926 */
927void
928xmlCleanupEncodingAliases(void) {
929 int i;
930
931 if (xmlCharEncodingAliases == NULL)
932 return;
933
934 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
935 if (xmlCharEncodingAliases[i].name != NULL)
936 xmlFree((char *) xmlCharEncodingAliases[i].name);
937 if (xmlCharEncodingAliases[i].alias != NULL)
938 xmlFree((char *) xmlCharEncodingAliases[i].alias);
939 }
940 xmlCharEncodingAliasesNb = 0;
941 xmlCharEncodingAliasesMax = 0;
942 xmlFree(xmlCharEncodingAliases);
943}
944
945/**
946 * xmlGetEncodingAlias:
947 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
948 *
949 * Lookup an encoding name for the given alias.
950 *
951 * Returns NULL if not found the original name otherwise
952 */
953const char *
954xmlGetEncodingAlias(const char *alias) {
955 int i;
956 char upper[100];
957
958 if (alias == NULL)
959 return(NULL);
960
961 if (xmlCharEncodingAliases == NULL)
962 return(NULL);
963
964 for (i = 0;i < 99;i++) {
965 upper[i] = toupper(alias[i]);
966 if (upper[i] == 0) break;
967 }
968 upper[i] = 0;
969
970 /*
971 * Walk down the list looking for a definition of the alias
972 */
973 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
974 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
975 return(xmlCharEncodingAliases[i].name);
976 }
977 }
978 return(NULL);
979}
980
981/**
982 * xmlAddEncodingAlias:
983 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
984 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
985 *
986 * Registers and alias @alias for an encoding named @name. Existing alias
987 * will be overwritten.
988 *
989 * Returns 0 in case of success, -1 in case of error
990 */
991int
992xmlAddEncodingAlias(const char *name, const char *alias) {
993 int i;
994 char upper[100];
995
996 if ((name == NULL) || (alias == NULL))
997 return(-1);
998
999 for (i = 0;i < 99;i++) {
1000 upper[i] = toupper(alias[i]);
1001 if (upper[i] == 0) break;
1002 }
1003 upper[i] = 0;
1004
1005 if (xmlCharEncodingAliases == NULL) {
1006 xmlCharEncodingAliasesNb = 0;
1007 xmlCharEncodingAliasesMax = 20;
1008 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1009 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1010 if (xmlCharEncodingAliases == NULL)
1011 return(-1);
1012 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1013 xmlCharEncodingAliasesMax *= 2;
1014 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1015 xmlRealloc(xmlCharEncodingAliases,
1016 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1017 }
1018 /*
1019 * Walk down the list looking for a definition of the alias
1020 */
1021 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1022 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1023 /*
1024 * Replace the definition.
1025 */
1026 xmlFree((char *) xmlCharEncodingAliases[i].name);
1027 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1028 return(0);
1029 }
1030 }
1031 /*
1032 * Add the definition
1033 */
1034 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1035 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1036 xmlCharEncodingAliasesNb++;
1037 return(0);
1038}
1039
1040/**
1041 * xmlDelEncodingAlias:
1042 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1043 *
1044 * Unregisters an encoding alias @alias
1045 *
1046 * Returns 0 in case of success, -1 in case of error
1047 */
1048int
1049xmlDelEncodingAlias(const char *alias) {
1050 int i;
1051
1052 if (alias == NULL)
1053 return(-1);
1054
1055 if (xmlCharEncodingAliases == NULL)
1056 return(-1);
1057 /*
1058 * Walk down the list looking for a definition of the alias
1059 */
1060 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1061 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1062 xmlFree((char *) xmlCharEncodingAliases[i].name);
1063 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1064 xmlCharEncodingAliasesNb--;
1065 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1066 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1067 return(0);
1068 }
1069 }
1070 return(-1);
1071}
1072
1073/**
1074 * xmlParseCharEncoding:
1075 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1076 *
1077 * Conpare the string to the known encoding schemes already known. Note
1078 * that the comparison is case insensitive accordingly to the section
1079 * [XML] 4.3.3 Character Encoding in Entities.
1080 *
1081 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1082 * if not recognized.
1083 */
1084xmlCharEncoding
1085xmlParseCharEncoding(const char* name)
1086{
1087 const char *alias;
1088 char upper[500];
1089 int i;
1090
1091 if (name == NULL)
1092 return(XML_CHAR_ENCODING_NONE);
1093
1094 /*
1095 * Do the alias resolution
1096 */
1097 alias = xmlGetEncodingAlias(name);
1098 if (alias != NULL)
1099 name = alias;
1100
1101 for (i = 0;i < 499;i++) {
1102 upper[i] = toupper(name[i]);
1103 if (upper[i] == 0) break;
1104 }
1105 upper[i] = 0;
1106
1107 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1108 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1109 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1110
1111 /*
1112 * NOTE: if we were able to parse this, the endianness of UTF16 is
1113 * already found and in use
1114 */
1115 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1116 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1117
1118 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1119 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1120 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1121
1122 /*
1123 * NOTE: if we were able to parse this, the endianness of UCS4 is
1124 * already found and in use
1125 */
1126 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1127 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1128 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1129
1130
1131 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1132 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1133 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1134
1135 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1136 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1137 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1138
1139 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1140 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1141 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1142 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1143 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1144 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1145 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1146
1147 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1148 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1149 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1150
1151#ifdef DEBUG_ENCODING
1152 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1153#endif
1154 return(XML_CHAR_ENCODING_ERROR);
1155}
1156
1157/**
1158 * xmlGetCharEncodingName:
1159 * @enc: the encoding
1160 *
1161 * The "canonical" name for XML encoding.
1162 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1163 * Section 4.3.3 Character Encoding in Entities
1164 *
1165 * Returns the canonical name for the given encoding
1166 */
1167
1168const char*
1169xmlGetCharEncodingName(xmlCharEncoding enc) {
1170 switch (enc) {
1171 case XML_CHAR_ENCODING_ERROR:
1172 return(NULL);
1173 case XML_CHAR_ENCODING_NONE:
1174 return(NULL);
1175 case XML_CHAR_ENCODING_UTF8:
1176 return("UTF-8");
1177 case XML_CHAR_ENCODING_UTF16LE:
1178 return("UTF-16");
1179 case XML_CHAR_ENCODING_UTF16BE:
1180 return("UTF-16");
1181 case XML_CHAR_ENCODING_EBCDIC:
1182 return("EBCDIC");
1183 case XML_CHAR_ENCODING_UCS4LE:
1184 return("ISO-10646-UCS-4");
1185 case XML_CHAR_ENCODING_UCS4BE:
1186 return("ISO-10646-UCS-4");
1187 case XML_CHAR_ENCODING_UCS4_2143:
1188 return("ISO-10646-UCS-4");
1189 case XML_CHAR_ENCODING_UCS4_3412:
1190 return("ISO-10646-UCS-4");
1191 case XML_CHAR_ENCODING_UCS2:
1192 return("ISO-10646-UCS-2");
1193 case XML_CHAR_ENCODING_8859_1:
1194 return("ISO-8859-1");
1195 case XML_CHAR_ENCODING_8859_2:
1196 return("ISO-8859-2");
1197 case XML_CHAR_ENCODING_8859_3:
1198 return("ISO-8859-3");
1199 case XML_CHAR_ENCODING_8859_4:
1200 return("ISO-8859-4");
1201 case XML_CHAR_ENCODING_8859_5:
1202 return("ISO-8859-5");
1203 case XML_CHAR_ENCODING_8859_6:
1204 return("ISO-8859-6");
1205 case XML_CHAR_ENCODING_8859_7:
1206 return("ISO-8859-7");
1207 case XML_CHAR_ENCODING_8859_8:
1208 return("ISO-8859-8");
1209 case XML_CHAR_ENCODING_8859_9:
1210 return("ISO-8859-9");
1211 case XML_CHAR_ENCODING_2022_JP:
1212 return("ISO-2022-JP");
1213 case XML_CHAR_ENCODING_SHIFT_JIS:
1214 return("Shift-JIS");
1215 case XML_CHAR_ENCODING_EUC_JP:
1216 return("EUC-JP");
1217 case XML_CHAR_ENCODING_ASCII:
1218 return(NULL);
1219 }
1220 return(NULL);
1221}
1222
1223/****************************************************************
1224 * *
1225 * Char encoding handlers *
1226 * *
1227 ****************************************************************/
1228
/*
 * Capacity of the registered handlers table.
 * the size should be growable, but it's not a big deal ...
 * xmlRegisterCharEncodingHandler() refuses new entries once this many
 * handlers are registered.
 */
#define MAX_ENCODING_HANDLERS 50
/* table of registered handlers, allocated by xmlInitCharEncodingHandlers() */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of entries of the handlers table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * This is the value xmlFindCharEncodingHandler() returns for a NULL or
 * empty encoding name.
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1240
1241/**
1242 * xmlNewCharEncodingHandler:
1243 * @name: the encoding name, in UTF-8 format (ASCII actually)
1244 * @input: the xmlCharEncodingInputFunc to read that encoding
1245 * @output: the xmlCharEncodingOutputFunc to write that encoding
1246 *
1247 * Create and registers an xmlCharEncodingHandler.
1248 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1249 */
1250xmlCharEncodingHandlerPtr
1251xmlNewCharEncodingHandler(const char *name,
1252 xmlCharEncodingInputFunc input,
1253 xmlCharEncodingOutputFunc output) {
1254 xmlCharEncodingHandlerPtr handler;
1255 const char *alias;
1256 char upper[500];
1257 int i;
1258 char *up = 0;
1259
1260 /*
1261 * Do the alias resolution
1262 */
1263 alias = xmlGetEncodingAlias(name);
1264 if (alias != NULL)
1265 name = alias;
1266
1267 /*
1268 * Keep only the uppercase version of the encoding.
1269 */
1270 if (name == NULL) {
1271 xmlGenericError(xmlGenericErrorContext,
1272 "xmlNewCharEncodingHandler : no name !\n");
1273 return(NULL);
1274 }
1275 for (i = 0;i < 499;i++) {
1276 upper[i] = toupper(name[i]);
1277 if (upper[i] == 0) break;
1278 }
1279 upper[i] = 0;
1280 up = xmlMemStrdup(upper);
1281 if (up == NULL) {
1282 xmlGenericError(xmlGenericErrorContext,
1283 "xmlNewCharEncodingHandler : out of memory !\n");
1284 return(NULL);
1285 }
1286
1287 /*
1288 * allocate and fill-up an handler block.
1289 */
1290 handler = (xmlCharEncodingHandlerPtr)
1291 xmlMalloc(sizeof(xmlCharEncodingHandler));
1292 if (handler == NULL) {
1293 xmlGenericError(xmlGenericErrorContext,
1294 "xmlNewCharEncodingHandler : out of memory !\n");
1295 return(NULL);
1296 }
1297 handler->input = input;
1298 handler->output = output;
1299 handler->name = up;
1300
1301#ifdef LIBXML_ICONV_ENABLED
1302 handler->iconv_in = NULL;
1303 handler->iconv_out = NULL;
1304#endif /* LIBXML_ICONV_ENABLED */
1305
1306 /*
1307 * registers and returns the handler.
1308 */
1309 xmlRegisterCharEncodingHandler(handler);
1310#ifdef DEBUG_ENCODING
1311 xmlGenericError(xmlGenericErrorContext,
1312 "Registered encoding handler for %s\n", name);
1313#endif
1314 return(handler);
1315}
1316
1317/**
1318 * xmlInitCharEncodingHandlers:
1319 *
1320 * Initialize the char encoding support, it registers the default
1321 * encoding supported.
1322 * NOTE: while public, this function usually doesn't need to be called
1323 * in normal processing.
1324 */
1325void
1326xmlInitCharEncodingHandlers(void) {
1327 unsigned short int tst = 0x1234;
1328 unsigned char *ptr = (unsigned char *) &tst;
1329
1330 if (handlers != NULL) return;
1331
1332 handlers = (xmlCharEncodingHandlerPtr *)
1333 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1334
1335 if (*ptr == 0x12) xmlLittleEndian = 0;
1336 else if (*ptr == 0x34) xmlLittleEndian = 1;
1337 else xmlGenericError(xmlGenericErrorContext,
1338 "Odd problem at endianness detection\n");
1339
1340 if (handlers == NULL) {
1341 xmlGenericError(xmlGenericErrorContext,
1342 "xmlInitCharEncodingHandlers : out of memory !\n");
1343 return;
1344 }
1345 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1346 xmlUTF16LEHandler =
1347 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1348 xmlUTF16BEHandler =
1349 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1350 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1351 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1352#ifdef LIBXML_HTML_ENABLED
1353 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1354#endif
1355}
1356
1357/**
1358 * xmlCleanupCharEncodingHandlers:
1359 *
1360 * Cleanup the memory allocated for the char encoding support, it
1361 * unregisters all the encoding handlers and the aliases.
1362 */
1363void
1364xmlCleanupCharEncodingHandlers(void) {
1365 xmlCleanupEncodingAliases();
1366
1367 if (handlers == NULL) return;
1368
1369 for (;nbCharEncodingHandler > 0;) {
1370 nbCharEncodingHandler--;
1371 if (handlers[nbCharEncodingHandler] != NULL) {
1372 if (handlers[nbCharEncodingHandler]->name != NULL)
1373 xmlFree(handlers[nbCharEncodingHandler]->name);
1374 xmlFree(handlers[nbCharEncodingHandler]);
1375 }
1376 }
1377 xmlFree(handlers);
1378 handlers = NULL;
1379 nbCharEncodingHandler = 0;
1380 xmlDefaultCharEncodingHandler = NULL;
1381}
1382
1383/**
1384 * xmlRegisterCharEncodingHandler:
1385 * @handler: the xmlCharEncodingHandlerPtr handler block
1386 *
1387 * Register the char encoding handler, surprizing, isn't it ?
1388 */
1389void
1390xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1391 if (handlers == NULL) xmlInitCharEncodingHandlers();
1392 if (handler == NULL) {
1393 xmlGenericError(xmlGenericErrorContext,
1394 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1395 return;
1396 }
1397
1398 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1399 xmlGenericError(xmlGenericErrorContext,
1400 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1401 xmlGenericError(xmlGenericErrorContext,
1402 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1403 return;
1404 }
1405 handlers[nbCharEncodingHandler++] = handler;
1406}
1407
1408/**
1409 * xmlGetCharEncodingHandler:
1410 * @enc: an xmlCharEncoding value.
1411 *
1412 * Search in the registrered set the handler able to read/write that encoding.
1413 *
1414 * Returns the handler or NULL if not found
1415 */
1416xmlCharEncodingHandlerPtr
1417xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1418 xmlCharEncodingHandlerPtr handler;
1419
1420 if (handlers == NULL) xmlInitCharEncodingHandlers();
1421 switch (enc) {
1422 case XML_CHAR_ENCODING_ERROR:
1423 return(NULL);
1424 case XML_CHAR_ENCODING_NONE:
1425 return(NULL);
1426 case XML_CHAR_ENCODING_UTF8:
1427 return(NULL);
1428 case XML_CHAR_ENCODING_UTF16LE:
1429 return(xmlUTF16LEHandler);
1430 case XML_CHAR_ENCODING_UTF16BE:
1431 return(xmlUTF16BEHandler);
1432 case XML_CHAR_ENCODING_EBCDIC:
1433 handler = xmlFindCharEncodingHandler("EBCDIC");
1434 if (handler != NULL) return(handler);
1435 handler = xmlFindCharEncodingHandler("ebcdic");
1436 if (handler != NULL) return(handler);
1437 break;
1438 case XML_CHAR_ENCODING_UCS4BE:
1439 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1440 if (handler != NULL) return(handler);
1441 handler = xmlFindCharEncodingHandler("UCS-4");
1442 if (handler != NULL) return(handler);
1443 handler = xmlFindCharEncodingHandler("UCS4");
1444 if (handler != NULL) return(handler);
1445 break;
1446 case XML_CHAR_ENCODING_UCS4LE:
1447 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1448 if (handler != NULL) return(handler);
1449 handler = xmlFindCharEncodingHandler("UCS-4");
1450 if (handler != NULL) return(handler);
1451 handler = xmlFindCharEncodingHandler("UCS4");
1452 if (handler != NULL) return(handler);
1453 break;
1454 case XML_CHAR_ENCODING_UCS4_2143:
1455 break;
1456 case XML_CHAR_ENCODING_UCS4_3412:
1457 break;
1458 case XML_CHAR_ENCODING_UCS2:
1459 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1460 if (handler != NULL) return(handler);
1461 handler = xmlFindCharEncodingHandler("UCS-2");
1462 if (handler != NULL) return(handler);
1463 handler = xmlFindCharEncodingHandler("UCS2");
1464 if (handler != NULL) return(handler);
1465 break;
1466
1467 /*
1468 * We used to keep ISO Latin encodings native in the
1469 * generated data. This led to so many problems that
1470 * this has been removed. One can still change this
1471 * back by registering no-ops encoders for those
1472 */
1473 case XML_CHAR_ENCODING_8859_1:
1474 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1475 if (handler != NULL) return(handler);
1476 break;
1477 case XML_CHAR_ENCODING_8859_2:
1478 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1479 if (handler != NULL) return(handler);
1480 break;
1481 case XML_CHAR_ENCODING_8859_3:
1482 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1483 if (handler != NULL) return(handler);
1484 break;
1485 case XML_CHAR_ENCODING_8859_4:
1486 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1487 if (handler != NULL) return(handler);
1488 break;
1489 case XML_CHAR_ENCODING_8859_5:
1490 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1491 if (handler != NULL) return(handler);
1492 break;
1493 case XML_CHAR_ENCODING_8859_6:
1494 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1495 if (handler != NULL) return(handler);
1496 break;
1497 case XML_CHAR_ENCODING_8859_7:
1498 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1499 if (handler != NULL) return(handler);
1500 break;
1501 case XML_CHAR_ENCODING_8859_8:
1502 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1503 if (handler != NULL) return(handler);
1504 break;
1505 case XML_CHAR_ENCODING_8859_9:
1506 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1507 if (handler != NULL) return(handler);
1508 break;
1509
1510
1511 case XML_CHAR_ENCODING_2022_JP:
1512 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1513 if (handler != NULL) return(handler);
1514 break;
1515 case XML_CHAR_ENCODING_SHIFT_JIS:
1516 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1517 if (handler != NULL) return(handler);
1518 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1519 if (handler != NULL) return(handler);
1520 handler = xmlFindCharEncodingHandler("Shift_JIS");
1521 if (handler != NULL) return(handler);
1522 break;
1523 case XML_CHAR_ENCODING_EUC_JP:
1524 handler = xmlFindCharEncodingHandler("EUC-JP");
1525 if (handler != NULL) return(handler);
1526 break;
1527 default:
1528 break;
1529 }
1530
1531#ifdef DEBUG_ENCODING
1532 xmlGenericError(xmlGenericErrorContext,
1533 "No handler found for encoding %d\n", enc);
1534#endif
1535 return(NULL);
1536}
1537
1538/**
1539 * xmlGetCharEncodingHandler:
1540 * @enc: a string describing the char encoding.
1541 *
1542 * Search in the registrered set the handler able to read/write that encoding.
1543 *
1544 * Returns the handler or NULL if not found
1545 */
1546xmlCharEncodingHandlerPtr
1547xmlFindCharEncodingHandler(const char *name) {
1548 const char *nalias;
1549 const char *norig;
1550 xmlCharEncoding alias;
1551#ifdef LIBXML_ICONV_ENABLED
1552 xmlCharEncodingHandlerPtr enc;
1553 iconv_t icv_in, icv_out;
1554#endif /* LIBXML_ICONV_ENABLED */
1555 char upper[100];
1556 int i;
1557
1558 if (handlers == NULL) xmlInitCharEncodingHandlers();
1559 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1560 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1561
1562 /*
1563 * Do the alias resolution
1564 */
1565 norig = name;
1566 nalias = xmlGetEncodingAlias(name);
1567 if (nalias != NULL)
1568 name = nalias;
1569
1570 /*
1571 * Check first for directly registered encoding names
1572 */
1573 for (i = 0;i < 99;i++) {
1574 upper[i] = toupper(name[i]);
1575 if (upper[i] == 0) break;
1576 }
1577 upper[i] = 0;
1578
1579 for (i = 0;i < nbCharEncodingHandler; i++)
1580 if (!strcmp(upper, handlers[i]->name)) {
1581#ifdef DEBUG_ENCODING
1582 xmlGenericError(xmlGenericErrorContext,
1583 "Found registered handler for encoding %s\n", name);
1584#endif
1585 return(handlers[i]);
1586 }
1587
1588#ifdef LIBXML_ICONV_ENABLED
1589 /* check whether iconv can handle this */
1590 icv_in = iconv_open("UTF-8", name);
1591 icv_out = iconv_open(name, "UTF-8");
1592 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1593 enc = (xmlCharEncodingHandlerPtr)
1594 xmlMalloc(sizeof(xmlCharEncodingHandler));
1595 if (enc == NULL) {
1596 iconv_close(icv_in);
1597 iconv_close(icv_out);
1598 return(NULL);
1599 }
1600 enc->name = xmlMemStrdup(name);
1601 enc->input = NULL;
1602 enc->output = NULL;
1603 enc->iconv_in = icv_in;
1604 enc->iconv_out = icv_out;
1605#ifdef DEBUG_ENCODING
1606 xmlGenericError(xmlGenericErrorContext,
1607 "Found iconv handler for encoding %s\n", name);
1608#endif
1609 return enc;
1610 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1611 xmlGenericError(xmlGenericErrorContext,
1612 "iconv : problems with filters for '%s'\n", name);
1613 }
1614#endif /* LIBXML_ICONV_ENABLED */
1615
1616#ifdef DEBUG_ENCODING
1617 xmlGenericError(xmlGenericErrorContext,
1618 "No handler found for encoding %s\n", name);
1619#endif
1620
1621 /*
1622 * Fallback using the canonical names
1623 */
1624 alias = xmlParseCharEncoding(norig);
1625 if (alias != XML_CHAR_ENCODING_ERROR) {
1626 const char* canon;
1627 canon = xmlGetCharEncodingName(alias);
1628 if ((canon != NULL) && (strcmp(name, canon))) {
1629 return(xmlFindCharEncodingHandler(canon));
1630 }
1631 }
1632
1633 return(NULL);
1634}
1635
1636#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert, may be NULL
 * @inlen:  the length of @in
 *
 * Runs one iconv() call on the given buffers and translates its outcome
 * into the return codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (the input is not valid for the source
 *        encoding or can't be represented in the target encoding), or
 *     -3 if the last bytes can't form a single output char, or for any
 *        other failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    const char *src = (const char *) in;
    char *dst = (char *) out;
    size_t srcLeft = *inlen;
    size_t dstLeft = *outlen;
    size_t status;

    status = iconv(cd,
                   &src, &srcLeft,
                   &dst, &dstLeft);

    /* turn the "bytes left" counters into "bytes used" counters */
    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
    } else {
        *inlen -= srcLeft;
        *outlen -= dstLeft;
    }

    if ((srcLeft == 0) && (status != (size_t) -1))
        return 0;

    /* the conversion stopped early, errno tells why */
#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL (truncated input sequence) and anything else */
    return -3;
}
1697#endif /* LIBXML_ICONV_ENABLED */
1698
1699/**
1700 * xmlCharEncFirstLine:
1701 * @handler: char enconding transformation data structure
1702 * @out: an xmlBuffer for the output.
1703 * @in: an xmlBuffer for the input
1704 *
1705 * Front-end for the encoding handler input function, but handle only
1706 * the very first line, i.e. limit itself to 45 chars.
1707 *
1708 * Returns the number of byte written if success, or
1709 * -1 general error
1710 * -2 if the transcoding fails (for *in is not valid utf8 string or
1711 * the result of transformation can't fit into the encoding we want), or
1712 */
1713int
1714xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1715 xmlBufferPtr in) {
1716 int ret = -2;
1717 int written;
1718 int toconv;
1719
1720 if (handler == NULL) return(-1);
1721 if (out == NULL) return(-1);
1722 if (in == NULL) return(-1);
1723
1724 written = out->size - out->use;
1725 toconv = in->use;
1726 if (toconv * 2 >= written) {
1727 xmlBufferGrow(out, toconv);
1728 written = out->size - out->use - 1;
1729 }
1730
1731 /*
1732 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1733 * 45 chars should be sufficient to reach the end of the encoding
1734 * decalration without going too far inside the document content.
1735 */
1736 written = 45;
1737
1738 if (handler->input != NULL) {
1739 ret = handler->input(&out->content[out->use], &written,
1740 in->content, &toconv);
1741 xmlBufferShrink(in, toconv);
1742 out->use += written;
1743 out->content[out->use] = 0;
1744 }
1745#ifdef LIBXML_ICONV_ENABLED
1746 else if (handler->iconv_in != NULL) {
1747 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1748 &written, in->content, &toconv);
1749 xmlBufferShrink(in, toconv);
1750 out->use += written;
1751 out->content[out->use] = 0;
1752 if (ret == -1) ret = -3;
1753 }
1754#endif /* LIBXML_ICONV_ENABLED */
1755#ifdef DEBUG_ENCODING
1756 switch (ret) {
1757 case 0:
1758 xmlGenericError(xmlGenericErrorContext,
1759 "converted %d bytes to %d bytes of input\n",
1760 toconv, written);
1761 break;
1762 case -1:
1763 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1764 toconv, written, in->use);
1765 break;
1766 case -2:
1767 xmlGenericError(xmlGenericErrorContext,
1768 "input conversion failed due to input error\n");
1769 break;
1770 case -3:
1771 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1772 toconv, written, in->use);
1773 break;
1774 default:
1775 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
1776 }
1777#endif
1778 /*
1779 * Ignore when input buffer is not on a boundary
1780 */
1781 if (ret == -3) ret = 0;
1782 if (ret == -1) ret = 0;
1783 return(ret);
1784}
1785
1786/**
1787 * xmlCharEncInFunc:
1788 * @handler: char enconding transformation data structure
1789 * @out: an xmlBuffer for the output.
1790 * @in: an xmlBuffer for the input
1791 *
1792 * Generic front-end for the encoding handler input function
1793 *
1794 * Returns the number of byte written if success, or
1795 * -1 general error
1796 * -2 if the transcoding fails (for *in is not valid utf8 string or
1797 * the result of transformation can't fit into the encoding we want), or
1798 */
1799int
1800xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1801 xmlBufferPtr in) {
1802 int ret = -2;
1803 int written;
1804 int toconv;
1805
1806 if (handler == NULL) return(-1);
1807 if (out == NULL) return(-1);
1808 if (in == NULL) return(-1);
1809
1810 toconv = in->use;
1811 if (toconv == 0)
1812 return(0);
1813 written = out->size - out->use;
1814 if (toconv * 2 >= written) {
1815 xmlBufferGrow(out, out->size + toconv * 2);
1816 written = out->size - out->use - 1;
1817 }
1818 if (handler->input != NULL) {
1819 ret = handler->input(&out->content[out->use], &written,
1820 in->content, &toconv);
1821 xmlBufferShrink(in, toconv);
1822 out->use += written;
1823 out->content[out->use] = 0;
1824 }
1825#ifdef LIBXML_ICONV_ENABLED
1826 else if (handler->iconv_in != NULL) {
1827 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1828 &written, in->content, &toconv);
1829 xmlBufferShrink(in, toconv);
1830 out->use += written;
1831 out->content[out->use] = 0;
1832 if (ret == -1) ret = -3;
1833 }
1834#endif /* LIBXML_ICONV_ENABLED */
1835 switch (ret) {
1836#ifdef DEBUG_ENCODING
1837 case 0:
1838 xmlGenericError(xmlGenericErrorContext,
1839 "converted %d bytes to %d bytes of input\n",
1840 toconv, written);
1841 break;
1842 case -1:
1843 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1844 toconv, written, in->use);
1845 break;
1846 case -3:
1847 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1848 toconv, written, in->use);
1849 break;
1850#endif
1851 case -2:
1852 xmlGenericError(xmlGenericErrorContext,
1853 "input conversion failed due to input error\n");
1854 xmlGenericError(xmlGenericErrorContext,
1855 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1856 in->content[0], in->content[1],
1857 in->content[2], in->content[3]);
1858 }
1859 /*
1860 * Ignore when input buffer is not on a boundary
1861 */
1862 if (ret == -3) ret = 0;
1863 return(ret);
1864}
1865
1866/**
1867 * xmlCharEncOutFunc:
1868 * @handler: char enconding transformation data structure
1869 * @out: an xmlBuffer for the output.
1870 * @in: an xmlBuffer for the input
1871 *
1872 * Generic front-end for the encoding handler output function
1873 * a first call with @in == NULL has to be made firs to initiate the
1874 * output in case of non-stateless encoding needing to initiate their
1875 * state or the output (like the BOM in UTF16).
1876 * In case of UTF8 sequence conversion errors for the given encoder,
1877 * the content will be automatically remapped to a CharRef sequence.
1878 *
1879 * Returns the number of byte written if success, or
1880 * -1 general error
1881 * -2 if the transcoding fails (for *in is not valid utf8 string or
1882 * the result of transformation can't fit into the encoding we want), or
1883 */
1884int
1885xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1886 xmlBufferPtr in) {
1887 int ret = -2;
1888 int written;
1889 int writtentot = 0;
1890 int toconv;
1891 int output = 0;
1892
1893 if (handler == NULL) return(-1);
1894 if (out == NULL) return(-1);
1895
1896retry:
1897
1898 written = out->size - out->use;
1899
1900 /*
1901 * First specific handling of in = NULL, i.e. the initialization call
1902 */
1903 if (in == NULL) {
1904 toconv = 0;
1905 if (handler->output != NULL) {
1906 ret = handler->output(&out->content[out->use], &written,
1907 NULL, &toconv);
1908 out->use += written;
1909 out->content[out->use] = 0;
1910 }
1911#ifdef LIBXML_ICONV_ENABLED
1912 else if (handler->iconv_out != NULL) {
1913 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1914 &written, NULL, &toconv);
1915 out->use += written;
1916 out->content[out->use] = 0;
1917 }
1918#endif /* LIBXML_ICONV_ENABLED */
1919#ifdef DEBUG_ENCODING
1920 xmlGenericError(xmlGenericErrorContext,
1921 "initialized encoder\n");
1922#endif
1923 return(0);
1924 }
1925
1926 /*
1927 * Convertion itself.
1928 */
1929 toconv = in->use;
1930 if (toconv == 0)
1931 return(0);
1932 if (toconv * 2 >= written) {
1933 xmlBufferGrow(out, toconv * 2);
1934 written = out->size - out->use - 1;
1935 }
1936 if (handler->output != NULL) {
1937 ret = handler->output(&out->content[out->use], &written,
1938 in->content, &toconv);
1939 xmlBufferShrink(in, toconv);
1940 out->use += written;
1941 writtentot += written;
1942 out->content[out->use] = 0;
1943 }
1944#ifdef LIBXML_ICONV_ENABLED
1945 else if (handler->iconv_out != NULL) {
1946 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1947 &written, in->content, &toconv);
1948 xmlBufferShrink(in, toconv);
1949 out->use += written;
1950 writtentot += written;
1951 out->content[out->use] = 0;
1952 if (ret == -1) {
1953 if (written > 0) {
1954 /*
1955 * Can be a limitation of iconv
1956 */
1957 goto retry;
1958 }
1959 ret = -3;
1960 }
1961 }
1962#endif /* LIBXML_ICONV_ENABLED */
1963 else {
1964 xmlGenericError(xmlGenericErrorContext,
1965 "xmlCharEncOutFunc: no output function !\n");
1966 return(-1);
1967 }
1968
1969 if (ret >= 0) output += ret;
1970
1971 /*
1972 * Attempt to handle error cases
1973 */
1974 switch (ret) {
1975#ifdef DEBUG_ENCODING
1976 case 0:
1977 xmlGenericError(xmlGenericErrorContext,
1978 "converted %d bytes to %d bytes of output\n",
1979 toconv, written);
1980 break;
1981 case -1:
1982 xmlGenericError(xmlGenericErrorContext,
1983 "output conversion failed by lack of space\n");
1984 break;
1985#endif
1986 case -3:
1987 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
1988 toconv, written, in->use);
1989 break;
1990 case -2: {
1991 int len = in->use;
1992 const xmlChar *utf = (const xmlChar *) in->content;
1993 int cur;
1994
1995 cur = xmlGetUTF8Char(utf, &len);
1996 if (cur > 0) {
1997 xmlChar charref[20];
1998
1999#ifdef DEBUG_ENCODING
2000 xmlGenericError(xmlGenericErrorContext,
2001 "handling output conversion error\n");
2002 xmlGenericError(xmlGenericErrorContext,
2003 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2004 in->content[0], in->content[1],
2005 in->content[2], in->content[3]);
2006#endif
2007 /*
2008 * Removes the UTF8 sequence, and replace it by a charref
2009 * and continue the transcoding phase, hoping the error
2010 * did not mangle the encoder state.
2011 */
2012 sprintf((char *) charref, "&#x%X;", cur);
2013 xmlBufferShrink(in, len);
2014 xmlBufferAddHead(in, charref, -1);
2015
2016 goto retry;
2017 } else {
2018 xmlGenericError(xmlGenericErrorContext,
2019 "output conversion failed due to conv error\n");
2020 xmlGenericError(xmlGenericErrorContext,
2021 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2022 in->content[0], in->content[1],
2023 in->content[2], in->content[3]);
2024 in->content[0] = ' ';
2025 }
2026 break;
2027 }
2028 }
2029 return(ret);
2030}
2031
2032/**
2033 * xmlCharEncCloseFunc:
2034 * @handler: char enconding transformation data structure
2035 *
2036 * Generic front-end for hencoding handler close function
2037 *
2038 * Returns 0 if success, or -1 in case of error
2039 */
2040int
2041xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2042 int ret = 0;
2043 if (handler == NULL) return(-1);
2044 if (handler->name == NULL) return(-1);
2045#ifdef LIBXML_ICONV_ENABLED
2046 /*
2047 * Iconv handlers can be oused only once, free the whole block.
2048 * and the associated icon resources.
2049 */
2050 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2051 if (handler->name != NULL)
2052 xmlFree(handler->name);
2053 handler->name = NULL;
2054 if (handler->iconv_out != NULL) {
2055 if (iconv_close(handler->iconv_out))
2056 ret = -1;
2057 handler->iconv_out = NULL;
2058 }
2059 if (handler->iconv_in != NULL) {
2060 if (iconv_close(handler->iconv_in))
2061 ret = -1;
2062 handler->iconv_in = NULL;
2063 }
2064 xmlFree(handler);
2065 }
2066#endif /* LIBXML_ICONV_ENABLED */
2067#ifdef DEBUG_ENCODING
2068 if (ret)
2069 xmlGenericError(xmlGenericErrorContext,
2070 "failed to close the encoding handler\n");
2071 else
2072 xmlGenericError(xmlGenericErrorContext,
2073 "closed the encoding handler\n");
2074
2075#endif
2076 return(ret);
2077}
2078