blob: f03285de8cbc2fe1709300b41c80a6dc678918e0 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
15 *
16 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
17 *
18 * See Copyright for the status of this software.
19 *
20 * Daniel.Veillard@w3.org
21 */
22
Bjorn Reese70a9da52001-04-21 16:57:29 +000023#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000024
Owen Taylor3473f882001-02-23 17:55:21 +000025#include <string.h>
26
27#ifdef HAVE_CTYPE_H
28#include <ctype.h>
29#endif
30#ifdef HAVE_STDLIB_H
31#include <stdlib.h>
32#endif
Owen Taylor3473f882001-02-23 17:55:21 +000033#ifdef LIBXML_ICONV_ENABLED
34#ifdef HAVE_ERRNO_H
35#include <errno.h>
36#endif
37#endif
38#include <libxml/encoding.h>
39#include <libxml/xmlmemory.h>
40#ifdef LIBXML_HTML_ENABLED
41#include <libxml/HTMLparser.h>
42#endif
43#include <libxml/xmlerror.h>
44
45xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
46xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
47
48typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
49typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
50struct _xmlCharEncodingAlias {
51 const char *name;
52 const char *alias;
53};
54
55static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
56static int xmlCharEncodingAliasesNb = 0;
57static int xmlCharEncodingAliasesMax = 0;
58
59#ifdef LIBXML_ICONV_ENABLED
60#if 0
61#define DEBUG_ENCODING /* Define this to get encoding traces */
62#endif
63#endif
64
65static int xmlLittleEndian = 1;
66
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Values above 0xFFFF need the 4-octet form shown on the last row.
 */
77
/**
 * xmlUTF8Strlen:
 * @utf:  a sequence of UTF-8 encoded bytes, zero terminated
 *
 * Count the characters of an UTF-8 string. Only the shape of each
 * sequence is looked at (lead byte class and continuation markers);
 * this is not a full UTF-8 validation.
 *
 * Returns the number of characters in the string or -1 in case of error
 */
int
xmlUTF8Strlen(const unsigned char *utf) {
    const unsigned char *cur;
    int count;
    int width;

    if (utf == NULL)
        return(-1);

    /* each pass recognises one character and records its byte width */
    for (cur = utf, count = 0; *cur != 0; cur += width, count++) {
        const unsigned char lead = cur[0];

        if ((lead & 0x80) == 0) {
            width = 1;
            continue;
        }
        if ((cur[1] & 0xc0) != 0x80)
            return(-1);
        if ((lead & 0xe0) != 0xe0) {
            width = 2;
            continue;
        }
        if ((cur[2] & 0xc0) != 0x80)
            return(-1);
        if ((lead & 0xf0) != 0xf0) {
            width = 3;
            continue;
        }
        if (((lead & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80))
            return(-1);
        width = 4;
    }
    return(count);
}
118
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  in: number of bytes available in @utf;
 *        out: number of bytes used by the character, 0 on error
 *
 * Read one UTF-8 character from @utf. Sequences of 1 to 4 bytes are
 * decoded; continuation bytes are checked for the 10xxxxxx pattern.
 *
 * Returns the char value or -1 in case of error. @len is updated when
 * it is not NULL; a NULL @len is itself reported as an error.
 */
static int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
185
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * A NULL @utf, a continuation byte (10xxxxxx) found where a lead byte
 * is expected, and a lead byte announcing more than 4 bytes are all
 * reported as invalid.
 *
 * Return value: true (1) if @utf is valid, 0 otherwise.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /*
             * 3-byte code; || stops at the first bad byte so the
             * terminating zero is never stepped over
             */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte or over-long lead byte */
            return(0);
        }
    }
    return(1);
}
229
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Copy a block of ASCII chars to an UTF-8 block of chars; every ASCII
 * char is its own UTF-8 encoding. Copying stops once fewer than six
 * bytes of @out would remain unused.
 *
 * Returns 0 if success, or -1 when a byte >= 0x80 is met.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const int capacity = *outlen;
    const int available = *inlen;
    int consumed = 0;
    int written = 0;
    int status = 0;

    while ((consumed < available) && (written + 5 < capacity)) {
        const unsigned char ch = in[consumed];

        if (ch >= 0x80) {
            /* not an ASCII char: report what was converted so far */
            status = -1;
            break;
        }
        out[written++] = ch;
        consumed++;
    }
    *outlen = written;
    *inlen = consumed;
    return(status);
}
280
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. A NULL @in is an initialization call and only
 * resets both lengths.
 *
 * Conversion stops without error when @out is full or when @in ends in
 * the middle of a multi-byte sequence; the unconverted tail is then not
 * counted in @inlen.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a character which does not fit in ASCII).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        } else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in Ascii */
            ret = -2;
            break;
        }

        /* incomplete sequence at the end of the block: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte: the input is malformed */
                ret = -2;
                break;
            }
            c <<= 6;
            c |= d & 0x3F;
        }
        if (ret != 0)
            break;

        /* assertion: c is a single UTF-4 value */
        if (c >= 0x80) {
            /* no chance for this in Ascii */
            ret = -2;
            break;
        }
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(ret);
}
363
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Chars below 0x80 are copied, the others become a
 * 2-byte sequence. A char is only converted when its whole encoding
 * fits in @out, so the output never ends with a truncated sequence.
 *
 * Returns 0; running out of room in @out is not an error.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend = in + (*inlen);
    unsigned int c;

    while (in < inend) {
        c = *in;

        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* both bytes or nothing: never leave a lone lead byte */
            if (outend - out < 2)
                break;
            *out++ = ((c >> 6) & 0x1F) | 0xC0;
            *out++ = (c & 0x3F) | 0x80;
        }
        in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
411
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. A NULL @in is an initialization call which only
 * resets both lengths. Conversion stops quietly when @out is full or
 * when @in ends inside a multi-byte sequence.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed input or
 * a character above 0xFF).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *cursor;
    const unsigned char *committed;
    const unsigned char *limit;
    unsigned char *dst;
    const unsigned char *dstlimit;
    unsigned int value, byte;
    int extra;
    int status = 0;

    if (in == NULL) {
        /* initialization: nothing to convert */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    cursor = in;
    committed = in;
    limit = in + (*inlen);
    dst = out;
    dstlimit = out + (*outlen);

    for (;;) {
        if (cursor >= limit)
            break;

        byte = *cursor++;
        if (byte < 0x80) {
            value = byte;
            extra = 0;
        } else if ((byte < 0xC0) || (byte >= 0xF8)) {
            /* continuation byte used as lead, or a 5+ byte lead */
            status = -2;
            break;
        } else if (byte < 0xE0) {
            value = byte & 0x1F;
            extra = 1;
        } else if (byte < 0xF0) {
            value = byte & 0x0F;
            extra = 2;
        } else {
            value = byte & 0x07;
            extra = 3;
        }

        /* the rest of the sequence is not in this block yet */
        if (limit - cursor < extra)
            break;

        while (extra > 0) {
            byte = *cursor++;
            if ((byte & 0xC0) != 0x80) {
                status = -2;
                goto done;
            }
            value = (value << 6) | (byte & 0x3F);
            extra--;
        }

        if (value > 0xFF) {
            /* no chance for this in IsoLat1 */
            status = -2;
            break;
        }
        if (dst >= dstlimit)
            break;
        *dst++ = value;
        committed = cursor;
    }

done:
    *outlen = dst - out;
    *inlen = committed - in;
    return(status);
}
499
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read one byte at a time, low
 * byte first, so the result does not depend on the byte order or the
 * alignment rules of the host. An odd trailing byte is ignored.
 *
 * Conversion stops without error when fewer than six bytes of @out
 * would remain free, or when the block ends between the two halves of
 * a surrogate pair.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high
 * surrogate not followed by a low surrogate).
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    /* the 5 byte slack guarantees room for one full 4-byte sequence */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* the low surrogate is not available yet */
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x80) { *out++ = c; bits = -6; }
        else if (c < 0x800) { *out++ = ((c >> 6) & 0x1F) | 0xC0; bits = 0; }
        else if (c < 0x10000) { *out++ = ((c >> 12) & 0x0F) | 0xE0; bits = 6; }
        else { *out++ = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
587
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written one byte at a time, low
 * byte first, so it does not depend on the byte order or the alignment
 * rules of the host. Characters above 0xFFFF become a surrogate pair.
 *
 * A NULL @in is the initialization call: the Byte Order Mark FF FE is
 * written when @outb has room for it.
 *
 * Conversion stops without error when @outb cannot hold the next
 * character or when @in ends inside a multi-byte sequence.
 *
 * Returns 2 when the Byte Order Mark was written, 0 on success, or -2
 * if the transcoding failed (malformed UTF-8 or a value >= 0x110000).
 * The value of *inlen after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units are usable */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        } else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* incomplete sequence at the end of the block: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte: the input is malformed */
                ret = -2;
                break;
            }
            c <<= 6;
            c |= d & 0x3F;
        }
        if (ret != 0)
            break;

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c & 0xFF);
            out[1] = (unsigned char) (c >> 8);
            out += 2;
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi & 0xFF);
            out[1] = (unsigned char) (hi >> 8);
            out[2] = (unsigned char) (lo & 0xFF);
            out[3] = (unsigned char) (lo >> 8);
            out += 4;
        } else {
            /* beyond the range UTF-16 can represent */
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(ret);
}
706
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read one byte at a time, high
 * byte first, so the result does not depend on the byte order or the
 * alignment rules of the host. An odd trailing byte is ignored.
 *
 * A character is only converted when its whole UTF-8 encoding fits in
 * @out. Conversion stops without error when it does not, or when the
 * block ends between the two halves of a surrogate pair.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high
 * surrogate not followed by a low surrogate).
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;
    int needed;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* the low surrogate is not available yet */
                break;
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x80) needed = 1;
        else if (c < 0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else needed = 4;

        /* all the bytes or none: never emit a truncated sequence */
        if (outend - out < needed)
            break;

        if (c < 0x80) { *out++ = c; bits = -6; }
        else if (c < 0x800) { *out++ = ((c >> 6) & 0x1F) | 0xC0; bits = 0; }
        else if (c < 0x10000) { *out++ = ((c >> 12) & 0x0F) | 0xE0; bits = 6; }
        else { *out++ = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
798
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written one byte at a time, high
 * byte first, so it does not depend on the byte order or the alignment
 * rules of the host. Characters above 0xFFFF become a surrogate pair.
 *
 * A NULL @in is the initialization call: the Byte Order Mark FE FF is
 * written when @outb has room for it.
 *
 * Conversion stops without error when @outb cannot hold the next
 * character or when @in ends inside a multi-byte sequence.
 *
 * Returns 2 when the Byte Order Mark was written, 0 on success, or -2
 * if the transcoding failed (malformed UTF-8 or a value >= 0x110000).
 * The value of *inlen after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units are usable */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        } else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* incomplete sequence at the end of the block: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte: the input is malformed */
                ret = -2;
                break;
            }
            c <<= 6;
            c |= d & 0x3F;
        }
        if (ret != 0)
            break;

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c >> 8);
            out[1] = (unsigned char) (c & 0xFF);
            out += 2;
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi >> 8);
            out[1] = (unsigned char) (hi & 0xFF);
            out[2] = (unsigned char) (lo >> 8);
            out[3] = (unsigned char) (lo & 0xFF);
            out += 4;
        } else {
            /* beyond the range UTF-16 can represent */
            ret = -2;
            break;
        }
        processed = in;
    }
    /* lengths are reported in bytes on every path */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(ret);
}
914
915/**
916 * xmlDetectCharEncoding:
917 * @in: a pointer to the first bytes of the XML entity, must be at least
918 * 4 bytes long.
919 * @len: pointer to the length of the buffer
920 *
921 * Guess the encoding of the entity using the first bytes of the entity content
922 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
923 *
924 * Returns one of the XML_CHAR_ENCODING_... values.
925 */
926xmlCharEncoding
927xmlDetectCharEncoding(const unsigned char* in, int len)
928{
929 if (len >= 4) {
930 if ((in[0] == 0x00) && (in[1] == 0x00) &&
931 (in[2] == 0x00) && (in[3] == 0x3C))
932 return(XML_CHAR_ENCODING_UCS4BE);
933 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
934 (in[2] == 0x00) && (in[3] == 0x00))
935 return(XML_CHAR_ENCODING_UCS4LE);
936 if ((in[0] == 0x00) && (in[1] == 0x00) &&
937 (in[2] == 0x3C) && (in[3] == 0x00))
938 return(XML_CHAR_ENCODING_UCS4_2143);
939 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
940 (in[2] == 0x00) && (in[3] == 0x00))
941 return(XML_CHAR_ENCODING_UCS4_3412);
942 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
943 (in[2] == 0xA7) && (in[3] == 0x94))
944 return(XML_CHAR_ENCODING_EBCDIC);
945 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
946 (in[2] == 0x78) && (in[3] == 0x6D))
947 return(XML_CHAR_ENCODING_UTF8);
948 }
949 if (len >= 2) {
950 if ((in[0] == 0xFE) && (in[1] == 0xFF))
951 return(XML_CHAR_ENCODING_UTF16BE);
952 if ((in[0] == 0xFF) && (in[1] == 0xFE))
953 return(XML_CHAR_ENCODING_UTF16LE);
954 }
955 return(XML_CHAR_ENCODING_NONE);
956}
957
958/**
959 * xmlCleanupEncodingAliases:
960 *
961 * Unregisters all aliases
962 */
963void
964xmlCleanupEncodingAliases(void) {
965 int i;
966
967 if (xmlCharEncodingAliases == NULL)
968 return;
969
970 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
971 if (xmlCharEncodingAliases[i].name != NULL)
972 xmlFree((char *) xmlCharEncodingAliases[i].name);
973 if (xmlCharEncodingAliases[i].alias != NULL)
974 xmlFree((char *) xmlCharEncodingAliases[i].alias);
975 }
976 xmlCharEncodingAliasesNb = 0;
977 xmlCharEncodingAliasesMax = 0;
978 xmlFree(xmlCharEncodingAliases);
979}
980
981/**
982 * xmlGetEncodingAlias:
983 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
984 *
985 * Lookup an encoding name for the given alias.
986 *
987 * Returns NULL if not found the original name otherwise
988 */
989const char *
990xmlGetEncodingAlias(const char *alias) {
991 int i;
992 char upper[100];
993
994 if (alias == NULL)
995 return(NULL);
996
997 if (xmlCharEncodingAliases == NULL)
998 return(NULL);
999
1000 for (i = 0;i < 99;i++) {
1001 upper[i] = toupper(alias[i]);
1002 if (upper[i] == 0) break;
1003 }
1004 upper[i] = 0;
1005
1006 /*
1007 * Walk down the list looking for a definition of the alias
1008 */
1009 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1010 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1011 return(xmlCharEncodingAliases[i].name);
1012 }
1013 }
1014 return(NULL);
1015}
1016
1017/**
1018 * xmlAddEncodingAlias:
1019 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1020 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1021 *
1022 * Registers and alias @alias for an encoding named @name. Existing alias
1023 * will be overwritten.
1024 *
1025 * Returns 0 in case of success, -1 in case of error
1026 */
1027int
1028xmlAddEncodingAlias(const char *name, const char *alias) {
1029 int i;
1030 char upper[100];
1031
1032 if ((name == NULL) || (alias == NULL))
1033 return(-1);
1034
1035 for (i = 0;i < 99;i++) {
1036 upper[i] = toupper(alias[i]);
1037 if (upper[i] == 0) break;
1038 }
1039 upper[i] = 0;
1040
1041 if (xmlCharEncodingAliases == NULL) {
1042 xmlCharEncodingAliasesNb = 0;
1043 xmlCharEncodingAliasesMax = 20;
1044 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1045 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1046 if (xmlCharEncodingAliases == NULL)
1047 return(-1);
1048 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1049 xmlCharEncodingAliasesMax *= 2;
1050 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1051 xmlRealloc(xmlCharEncodingAliases,
1052 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1053 }
1054 /*
1055 * Walk down the list looking for a definition of the alias
1056 */
1057 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1058 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1059 /*
1060 * Replace the definition.
1061 */
1062 xmlFree((char *) xmlCharEncodingAliases[i].name);
1063 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1064 return(0);
1065 }
1066 }
1067 /*
1068 * Add the definition
1069 */
1070 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1071 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1072 xmlCharEncodingAliasesNb++;
1073 return(0);
1074}
1075
1076/**
1077 * xmlDelEncodingAlias:
1078 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1079 *
1080 * Unregisters an encoding alias @alias
1081 *
1082 * Returns 0 in case of success, -1 in case of error
1083 */
1084int
1085xmlDelEncodingAlias(const char *alias) {
1086 int i;
1087
1088 if (alias == NULL)
1089 return(-1);
1090
1091 if (xmlCharEncodingAliases == NULL)
1092 return(-1);
1093 /*
1094 * Walk down the list looking for a definition of the alias
1095 */
1096 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1097 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1098 xmlFree((char *) xmlCharEncodingAliases[i].name);
1099 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1100 xmlCharEncodingAliasesNb--;
1101 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1102 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1103 return(0);
1104 }
1105 }
1106 return(-1);
1107}
1108
1109/**
1110 * xmlParseCharEncoding:
1111 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1112 *
1113 * Conpare the string to the known encoding schemes already known. Note
1114 * that the comparison is case insensitive accordingly to the section
1115 * [XML] 4.3.3 Character Encoding in Entities.
1116 *
1117 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1118 * if not recognized.
1119 */
1120xmlCharEncoding
1121xmlParseCharEncoding(const char* name)
1122{
1123 const char *alias;
1124 char upper[500];
1125 int i;
1126
1127 if (name == NULL)
1128 return(XML_CHAR_ENCODING_NONE);
1129
1130 /*
1131 * Do the alias resolution
1132 */
1133 alias = xmlGetEncodingAlias(name);
1134 if (alias != NULL)
1135 name = alias;
1136
1137 for (i = 0;i < 499;i++) {
1138 upper[i] = toupper(name[i]);
1139 if (upper[i] == 0) break;
1140 }
1141 upper[i] = 0;
1142
1143 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1144 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1145 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1146
1147 /*
1148 * NOTE: if we were able to parse this, the endianness of UTF16 is
1149 * already found and in use
1150 */
1151 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1152 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1153
1154 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1155 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1156 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1157
1158 /*
1159 * NOTE: if we were able to parse this, the endianness of UCS4 is
1160 * already found and in use
1161 */
1162 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1163 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1164 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1165
1166
1167 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1168 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1169 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1170
1171 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1172 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1173 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1174
1175 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1176 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1177 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1178 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1179 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1180 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1181 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1182
1183 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1184 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1185 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1186
1187#ifdef DEBUG_ENCODING
1188 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1189#endif
1190 return(XML_CHAR_ENCODING_ERROR);
1191}
1192
1193/**
1194 * xmlGetCharEncodingName:
1195 * @enc: the encoding
1196 *
1197 * The "canonical" name for XML encoding.
1198 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1199 * Section 4.3.3 Character Encoding in Entities
1200 *
1201 * Returns the canonical name for the given encoding
1202 */
1203
1204const char*
1205xmlGetCharEncodingName(xmlCharEncoding enc) {
1206 switch (enc) {
1207 case XML_CHAR_ENCODING_ERROR:
1208 return(NULL);
1209 case XML_CHAR_ENCODING_NONE:
1210 return(NULL);
1211 case XML_CHAR_ENCODING_UTF8:
1212 return("UTF-8");
1213 case XML_CHAR_ENCODING_UTF16LE:
1214 return("UTF-16");
1215 case XML_CHAR_ENCODING_UTF16BE:
1216 return("UTF-16");
1217 case XML_CHAR_ENCODING_EBCDIC:
1218 return("EBCDIC");
1219 case XML_CHAR_ENCODING_UCS4LE:
1220 return("ISO-10646-UCS-4");
1221 case XML_CHAR_ENCODING_UCS4BE:
1222 return("ISO-10646-UCS-4");
1223 case XML_CHAR_ENCODING_UCS4_2143:
1224 return("ISO-10646-UCS-4");
1225 case XML_CHAR_ENCODING_UCS4_3412:
1226 return("ISO-10646-UCS-4");
1227 case XML_CHAR_ENCODING_UCS2:
1228 return("ISO-10646-UCS-2");
1229 case XML_CHAR_ENCODING_8859_1:
1230 return("ISO-8859-1");
1231 case XML_CHAR_ENCODING_8859_2:
1232 return("ISO-8859-2");
1233 case XML_CHAR_ENCODING_8859_3:
1234 return("ISO-8859-3");
1235 case XML_CHAR_ENCODING_8859_4:
1236 return("ISO-8859-4");
1237 case XML_CHAR_ENCODING_8859_5:
1238 return("ISO-8859-5");
1239 case XML_CHAR_ENCODING_8859_6:
1240 return("ISO-8859-6");
1241 case XML_CHAR_ENCODING_8859_7:
1242 return("ISO-8859-7");
1243 case XML_CHAR_ENCODING_8859_8:
1244 return("ISO-8859-8");
1245 case XML_CHAR_ENCODING_8859_9:
1246 return("ISO-8859-9");
1247 case XML_CHAR_ENCODING_2022_JP:
1248 return("ISO-2022-JP");
1249 case XML_CHAR_ENCODING_SHIFT_JIS:
1250 return("Shift-JIS");
1251 case XML_CHAR_ENCODING_EUC_JP:
1252 return("EUC-JP");
1253 case XML_CHAR_ENCODING_ASCII:
1254 return(NULL);
1255 }
1256 return(NULL);
1257}
1258
1259/****************************************************************
1260 * *
1261 * Char encoding handlers *
1262 * *
1263 ****************************************************************/
1264
/*
 * Table of the registered char encoding handlers. It is allocated by
 * xmlInitCharEncodingHandlers() with room for MAX_ENCODING_HANDLERS
 * entries and released by xmlCleanupCharEncodingHandlers().
 * The size should be growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of slots of the handlers table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * xmlFindCharEncodingHandler() returns this value when asked for a
 * NULL or empty encoding name.
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1276
1277/**
1278 * xmlNewCharEncodingHandler:
1279 * @name: the encoding name, in UTF-8 format (ASCII actually)
1280 * @input: the xmlCharEncodingInputFunc to read that encoding
1281 * @output: the xmlCharEncodingOutputFunc to write that encoding
1282 *
1283 * Create and registers an xmlCharEncodingHandler.
1284 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1285 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001286static xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001287xmlNewCharEncodingHandler(const char *name,
1288 xmlCharEncodingInputFunc input,
1289 xmlCharEncodingOutputFunc output) {
1290 xmlCharEncodingHandlerPtr handler;
1291 const char *alias;
1292 char upper[500];
1293 int i;
1294 char *up = 0;
1295
1296 /*
1297 * Do the alias resolution
1298 */
1299 alias = xmlGetEncodingAlias(name);
1300 if (alias != NULL)
1301 name = alias;
1302
1303 /*
1304 * Keep only the uppercase version of the encoding.
1305 */
1306 if (name == NULL) {
1307 xmlGenericError(xmlGenericErrorContext,
1308 "xmlNewCharEncodingHandler : no name !\n");
1309 return(NULL);
1310 }
1311 for (i = 0;i < 499;i++) {
1312 upper[i] = toupper(name[i]);
1313 if (upper[i] == 0) break;
1314 }
1315 upper[i] = 0;
1316 up = xmlMemStrdup(upper);
1317 if (up == NULL) {
1318 xmlGenericError(xmlGenericErrorContext,
1319 "xmlNewCharEncodingHandler : out of memory !\n");
1320 return(NULL);
1321 }
1322
1323 /*
1324 * allocate and fill-up an handler block.
1325 */
1326 handler = (xmlCharEncodingHandlerPtr)
1327 xmlMalloc(sizeof(xmlCharEncodingHandler));
1328 if (handler == NULL) {
1329 xmlGenericError(xmlGenericErrorContext,
1330 "xmlNewCharEncodingHandler : out of memory !\n");
1331 return(NULL);
1332 }
1333 handler->input = input;
1334 handler->output = output;
1335 handler->name = up;
1336
1337#ifdef LIBXML_ICONV_ENABLED
1338 handler->iconv_in = NULL;
1339 handler->iconv_out = NULL;
1340#endif /* LIBXML_ICONV_ENABLED */
1341
1342 /*
1343 * registers and returns the handler.
1344 */
1345 xmlRegisterCharEncodingHandler(handler);
1346#ifdef DEBUG_ENCODING
1347 xmlGenericError(xmlGenericErrorContext,
1348 "Registered encoding handler for %s\n", name);
1349#endif
1350 return(handler);
1351}
1352
1353/**
1354 * xmlInitCharEncodingHandlers:
1355 *
1356 * Initialize the char encoding support, it registers the default
1357 * encoding supported.
1358 * NOTE: while public, this function usually doesn't need to be called
1359 * in normal processing.
1360 */
1361void
1362xmlInitCharEncodingHandlers(void) {
1363 unsigned short int tst = 0x1234;
1364 unsigned char *ptr = (unsigned char *) &tst;
1365
1366 if (handlers != NULL) return;
1367
1368 handlers = (xmlCharEncodingHandlerPtr *)
1369 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1370
1371 if (*ptr == 0x12) xmlLittleEndian = 0;
1372 else if (*ptr == 0x34) xmlLittleEndian = 1;
1373 else xmlGenericError(xmlGenericErrorContext,
1374 "Odd problem at endianness detection\n");
1375
1376 if (handlers == NULL) {
1377 xmlGenericError(xmlGenericErrorContext,
1378 "xmlInitCharEncodingHandlers : out of memory !\n");
1379 return;
1380 }
1381 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1382 xmlUTF16LEHandler =
1383 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1384 xmlUTF16BEHandler =
1385 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1386 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1387 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1388#ifdef LIBXML_HTML_ENABLED
1389 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1390#endif
1391}
1392
1393/**
1394 * xmlCleanupCharEncodingHandlers:
1395 *
1396 * Cleanup the memory allocated for the char encoding support, it
1397 * unregisters all the encoding handlers and the aliases.
1398 */
1399void
1400xmlCleanupCharEncodingHandlers(void) {
1401 xmlCleanupEncodingAliases();
1402
1403 if (handlers == NULL) return;
1404
1405 for (;nbCharEncodingHandler > 0;) {
1406 nbCharEncodingHandler--;
1407 if (handlers[nbCharEncodingHandler] != NULL) {
1408 if (handlers[nbCharEncodingHandler]->name != NULL)
1409 xmlFree(handlers[nbCharEncodingHandler]->name);
1410 xmlFree(handlers[nbCharEncodingHandler]);
1411 }
1412 }
1413 xmlFree(handlers);
1414 handlers = NULL;
1415 nbCharEncodingHandler = 0;
1416 xmlDefaultCharEncodingHandler = NULL;
1417}
1418
1419/**
1420 * xmlRegisterCharEncodingHandler:
1421 * @handler: the xmlCharEncodingHandlerPtr handler block
1422 *
1423 * Register the char encoding handler, surprizing, isn't it ?
1424 */
1425void
1426xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1427 if (handlers == NULL) xmlInitCharEncodingHandlers();
1428 if (handler == NULL) {
1429 xmlGenericError(xmlGenericErrorContext,
1430 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1431 return;
1432 }
1433
1434 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1435 xmlGenericError(xmlGenericErrorContext,
1436 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1437 xmlGenericError(xmlGenericErrorContext,
1438 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1439 return;
1440 }
1441 handlers[nbCharEncodingHandler++] = handler;
1442}
1443
1444/**
1445 * xmlGetCharEncodingHandler:
1446 * @enc: an xmlCharEncoding value.
1447 *
1448 * Search in the registrered set the handler able to read/write that encoding.
1449 *
1450 * Returns the handler or NULL if not found
1451 */
1452xmlCharEncodingHandlerPtr
1453xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1454 xmlCharEncodingHandlerPtr handler;
1455
1456 if (handlers == NULL) xmlInitCharEncodingHandlers();
1457 switch (enc) {
1458 case XML_CHAR_ENCODING_ERROR:
1459 return(NULL);
1460 case XML_CHAR_ENCODING_NONE:
1461 return(NULL);
1462 case XML_CHAR_ENCODING_UTF8:
1463 return(NULL);
1464 case XML_CHAR_ENCODING_UTF16LE:
1465 return(xmlUTF16LEHandler);
1466 case XML_CHAR_ENCODING_UTF16BE:
1467 return(xmlUTF16BEHandler);
1468 case XML_CHAR_ENCODING_EBCDIC:
1469 handler = xmlFindCharEncodingHandler("EBCDIC");
1470 if (handler != NULL) return(handler);
1471 handler = xmlFindCharEncodingHandler("ebcdic");
1472 if (handler != NULL) return(handler);
1473 break;
1474 case XML_CHAR_ENCODING_UCS4BE:
1475 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1476 if (handler != NULL) return(handler);
1477 handler = xmlFindCharEncodingHandler("UCS-4");
1478 if (handler != NULL) return(handler);
1479 handler = xmlFindCharEncodingHandler("UCS4");
1480 if (handler != NULL) return(handler);
1481 break;
1482 case XML_CHAR_ENCODING_UCS4LE:
1483 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1484 if (handler != NULL) return(handler);
1485 handler = xmlFindCharEncodingHandler("UCS-4");
1486 if (handler != NULL) return(handler);
1487 handler = xmlFindCharEncodingHandler("UCS4");
1488 if (handler != NULL) return(handler);
1489 break;
1490 case XML_CHAR_ENCODING_UCS4_2143:
1491 break;
1492 case XML_CHAR_ENCODING_UCS4_3412:
1493 break;
1494 case XML_CHAR_ENCODING_UCS2:
1495 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1496 if (handler != NULL) return(handler);
1497 handler = xmlFindCharEncodingHandler("UCS-2");
1498 if (handler != NULL) return(handler);
1499 handler = xmlFindCharEncodingHandler("UCS2");
1500 if (handler != NULL) return(handler);
1501 break;
1502
1503 /*
1504 * We used to keep ISO Latin encodings native in the
1505 * generated data. This led to so many problems that
1506 * this has been removed. One can still change this
1507 * back by registering no-ops encoders for those
1508 */
1509 case XML_CHAR_ENCODING_8859_1:
1510 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1511 if (handler != NULL) return(handler);
1512 break;
1513 case XML_CHAR_ENCODING_8859_2:
1514 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1515 if (handler != NULL) return(handler);
1516 break;
1517 case XML_CHAR_ENCODING_8859_3:
1518 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1519 if (handler != NULL) return(handler);
1520 break;
1521 case XML_CHAR_ENCODING_8859_4:
1522 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1523 if (handler != NULL) return(handler);
1524 break;
1525 case XML_CHAR_ENCODING_8859_5:
1526 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1527 if (handler != NULL) return(handler);
1528 break;
1529 case XML_CHAR_ENCODING_8859_6:
1530 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1531 if (handler != NULL) return(handler);
1532 break;
1533 case XML_CHAR_ENCODING_8859_7:
1534 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1535 if (handler != NULL) return(handler);
1536 break;
1537 case XML_CHAR_ENCODING_8859_8:
1538 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1539 if (handler != NULL) return(handler);
1540 break;
1541 case XML_CHAR_ENCODING_8859_9:
1542 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1543 if (handler != NULL) return(handler);
1544 break;
1545
1546
1547 case XML_CHAR_ENCODING_2022_JP:
1548 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1549 if (handler != NULL) return(handler);
1550 break;
1551 case XML_CHAR_ENCODING_SHIFT_JIS:
1552 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1553 if (handler != NULL) return(handler);
1554 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1555 if (handler != NULL) return(handler);
1556 handler = xmlFindCharEncodingHandler("Shift_JIS");
1557 if (handler != NULL) return(handler);
1558 break;
1559 case XML_CHAR_ENCODING_EUC_JP:
1560 handler = xmlFindCharEncodingHandler("EUC-JP");
1561 if (handler != NULL) return(handler);
1562 break;
1563 default:
1564 break;
1565 }
1566
1567#ifdef DEBUG_ENCODING
1568 xmlGenericError(xmlGenericErrorContext,
1569 "No handler found for encoding %d\n", enc);
1570#endif
1571 return(NULL);
1572}
1573
1574/**
1575 * xmlGetCharEncodingHandler:
1576 * @enc: a string describing the char encoding.
1577 *
1578 * Search in the registrered set the handler able to read/write that encoding.
1579 *
1580 * Returns the handler or NULL if not found
1581 */
1582xmlCharEncodingHandlerPtr
1583xmlFindCharEncodingHandler(const char *name) {
1584 const char *nalias;
1585 const char *norig;
1586 xmlCharEncoding alias;
1587#ifdef LIBXML_ICONV_ENABLED
1588 xmlCharEncodingHandlerPtr enc;
1589 iconv_t icv_in, icv_out;
1590#endif /* LIBXML_ICONV_ENABLED */
1591 char upper[100];
1592 int i;
1593
1594 if (handlers == NULL) xmlInitCharEncodingHandlers();
1595 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1596 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1597
1598 /*
1599 * Do the alias resolution
1600 */
1601 norig = name;
1602 nalias = xmlGetEncodingAlias(name);
1603 if (nalias != NULL)
1604 name = nalias;
1605
1606 /*
1607 * Check first for directly registered encoding names
1608 */
1609 for (i = 0;i < 99;i++) {
1610 upper[i] = toupper(name[i]);
1611 if (upper[i] == 0) break;
1612 }
1613 upper[i] = 0;
1614
1615 for (i = 0;i < nbCharEncodingHandler; i++)
1616 if (!strcmp(upper, handlers[i]->name)) {
1617#ifdef DEBUG_ENCODING
1618 xmlGenericError(xmlGenericErrorContext,
1619 "Found registered handler for encoding %s\n", name);
1620#endif
1621 return(handlers[i]);
1622 }
1623
1624#ifdef LIBXML_ICONV_ENABLED
1625 /* check whether iconv can handle this */
1626 icv_in = iconv_open("UTF-8", name);
1627 icv_out = iconv_open(name, "UTF-8");
1628 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1629 enc = (xmlCharEncodingHandlerPtr)
1630 xmlMalloc(sizeof(xmlCharEncodingHandler));
1631 if (enc == NULL) {
1632 iconv_close(icv_in);
1633 iconv_close(icv_out);
1634 return(NULL);
1635 }
1636 enc->name = xmlMemStrdup(name);
1637 enc->input = NULL;
1638 enc->output = NULL;
1639 enc->iconv_in = icv_in;
1640 enc->iconv_out = icv_out;
1641#ifdef DEBUG_ENCODING
1642 xmlGenericError(xmlGenericErrorContext,
1643 "Found iconv handler for encoding %s\n", name);
1644#endif
1645 return enc;
1646 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1647 xmlGenericError(xmlGenericErrorContext,
1648 "iconv : problems with filters for '%s'\n", name);
1649 }
1650#endif /* LIBXML_ICONV_ENABLED */
1651
1652#ifdef DEBUG_ENCODING
1653 xmlGenericError(xmlGenericErrorContext,
1654 "No handler found for encoding %s\n", name);
1655#endif
1656
1657 /*
1658 * Fallback using the canonical names
1659 */
1660 alias = xmlParseCharEncoding(norig);
1661 if (alias != XML_CHAR_ENCODING_ERROR) {
1662 const char* canon;
1663 canon = xmlGetCharEncodingName(alias);
1664 if ((canon != NULL) && (strcmp(name, canon))) {
1665 return(xmlFindCharEncodingHandler(canon));
1666 }
1667 }
1668
1669 return(NULL);
1670}
1671
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes in the source encoding of @cd,
 *       or NULL to reset the converter to its initial state
 * @inlen:  the length of @in
 *
 * Thin adaptation of iconv() to the calling convention of the encoding
 * handlers (int lengths, small negative error codes).
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (for *in is not valid in the source
 *        encoding or can't be represented in the target encoding), or
 *     -3 if the input ends with an incomplete sequence (or for any
 *        other iconv failure).
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced,
 * including the ones emitted by a reset call (@in == NULL).
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;

    /*
     * The second parameter of iconv() is declared either char ** or
     * const char ** depending on the platform; going through void *
     * is accepted by both prototypes.
     */
    ret = iconv(cd, (void *) &icv_in, &icv_inlen, &icv_out, &icv_outlen);

    if (in != NULL)
        *inlen -= (int) icv_inlen;
    else
        *inlen = 0;
    /*
     * Also report the output of a reset call: iconv may have stored a
     * sequence bringing the output back to its initial shift state.
     */
    *outlen -= (int) icv_outlen;

    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (errno == EILSEQ)
            return -2;
#endif
#ifdef E2BIG
        if (errno == E2BIG)
            return -1;
#endif
        /* EINVAL (truncated input sequence) and everything else */
        return -3;
    }
    return 0;
}
#endif /* LIBXML_ICONV_ENABLED */
1732
1733/**
1734 * xmlCharEncFirstLine:
1735 * @handler: char enconding transformation data structure
1736 * @out: an xmlBuffer for the output.
1737 * @in: an xmlBuffer for the input
1738 *
1739 * Front-end for the encoding handler input function, but handle only
1740 * the very first line, i.e. limit itself to 45 chars.
1741 *
1742 * Returns the number of byte written if success, or
1743 * -1 general error
1744 * -2 if the transcoding fails (for *in is not valid utf8 string or
1745 * the result of transformation can't fit into the encoding we want), or
1746 */
1747int
1748xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1749 xmlBufferPtr in) {
1750 int ret = -2;
1751 int written;
1752 int toconv;
1753
1754 if (handler == NULL) return(-1);
1755 if (out == NULL) return(-1);
1756 if (in == NULL) return(-1);
1757
1758 written = out->size - out->use;
1759 toconv = in->use;
1760 if (toconv * 2 >= written) {
1761 xmlBufferGrow(out, toconv);
1762 written = out->size - out->use - 1;
1763 }
1764
1765 /*
1766 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1767 * 45 chars should be sufficient to reach the end of the encoding
1768 * decalration without going too far inside the document content.
1769 */
1770 written = 45;
1771
1772 if (handler->input != NULL) {
1773 ret = handler->input(&out->content[out->use], &written,
1774 in->content, &toconv);
1775 xmlBufferShrink(in, toconv);
1776 out->use += written;
1777 out->content[out->use] = 0;
1778 }
1779#ifdef LIBXML_ICONV_ENABLED
1780 else if (handler->iconv_in != NULL) {
1781 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1782 &written, in->content, &toconv);
1783 xmlBufferShrink(in, toconv);
1784 out->use += written;
1785 out->content[out->use] = 0;
1786 if (ret == -1) ret = -3;
1787 }
1788#endif /* LIBXML_ICONV_ENABLED */
1789#ifdef DEBUG_ENCODING
1790 switch (ret) {
1791 case 0:
1792 xmlGenericError(xmlGenericErrorContext,
1793 "converted %d bytes to %d bytes of input\n",
1794 toconv, written);
1795 break;
1796 case -1:
1797 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1798 toconv, written, in->use);
1799 break;
1800 case -2:
1801 xmlGenericError(xmlGenericErrorContext,
1802 "input conversion failed due to input error\n");
1803 break;
1804 case -3:
1805 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1806 toconv, written, in->use);
1807 break;
1808 default:
1809 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
1810 }
1811#endif
1812 /*
1813 * Ignore when input buffer is not on a boundary
1814 */
1815 if (ret == -3) ret = 0;
1816 if (ret == -1) ret = 0;
1817 return(ret);
1818}
1819
1820/**
1821 * xmlCharEncInFunc:
1822 * @handler: char enconding transformation data structure
1823 * @out: an xmlBuffer for the output.
1824 * @in: an xmlBuffer for the input
1825 *
1826 * Generic front-end for the encoding handler input function
1827 *
1828 * Returns the number of byte written if success, or
1829 * -1 general error
1830 * -2 if the transcoding fails (for *in is not valid utf8 string or
1831 * the result of transformation can't fit into the encoding we want), or
1832 */
1833int
1834xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1835 xmlBufferPtr in) {
1836 int ret = -2;
1837 int written;
1838 int toconv;
1839
1840 if (handler == NULL) return(-1);
1841 if (out == NULL) return(-1);
1842 if (in == NULL) return(-1);
1843
1844 toconv = in->use;
1845 if (toconv == 0)
1846 return(0);
1847 written = out->size - out->use;
1848 if (toconv * 2 >= written) {
1849 xmlBufferGrow(out, out->size + toconv * 2);
1850 written = out->size - out->use - 1;
1851 }
1852 if (handler->input != NULL) {
1853 ret = handler->input(&out->content[out->use], &written,
1854 in->content, &toconv);
1855 xmlBufferShrink(in, toconv);
1856 out->use += written;
1857 out->content[out->use] = 0;
1858 }
1859#ifdef LIBXML_ICONV_ENABLED
1860 else if (handler->iconv_in != NULL) {
1861 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1862 &written, in->content, &toconv);
1863 xmlBufferShrink(in, toconv);
1864 out->use += written;
1865 out->content[out->use] = 0;
1866 if (ret == -1) ret = -3;
1867 }
1868#endif /* LIBXML_ICONV_ENABLED */
1869 switch (ret) {
1870#ifdef DEBUG_ENCODING
1871 case 0:
1872 xmlGenericError(xmlGenericErrorContext,
1873 "converted %d bytes to %d bytes of input\n",
1874 toconv, written);
1875 break;
1876 case -1:
1877 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1878 toconv, written, in->use);
1879 break;
1880 case -3:
1881 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1882 toconv, written, in->use);
1883 break;
1884#endif
1885 case -2:
1886 xmlGenericError(xmlGenericErrorContext,
1887 "input conversion failed due to input error\n");
1888 xmlGenericError(xmlGenericErrorContext,
1889 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1890 in->content[0], in->content[1],
1891 in->content[2], in->content[3]);
1892 }
1893 /*
1894 * Ignore when input buffer is not on a boundary
1895 */
1896 if (ret == -3) ret = 0;
1897 return(ret);
1898}
1899
1900/**
1901 * xmlCharEncOutFunc:
1902 * @handler: char enconding transformation data structure
1903 * @out: an xmlBuffer for the output.
1904 * @in: an xmlBuffer for the input
1905 *
1906 * Generic front-end for the encoding handler output function
1907 * a first call with @in == NULL has to be made firs to initiate the
1908 * output in case of non-stateless encoding needing to initiate their
1909 * state or the output (like the BOM in UTF16).
1910 * In case of UTF8 sequence conversion errors for the given encoder,
1911 * the content will be automatically remapped to a CharRef sequence.
1912 *
1913 * Returns the number of byte written if success, or
1914 * -1 general error
1915 * -2 if the transcoding fails (for *in is not valid utf8 string or
1916 * the result of transformation can't fit into the encoding we want), or
1917 */
1918int
1919xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1920 xmlBufferPtr in) {
1921 int ret = -2;
1922 int written;
1923 int writtentot = 0;
1924 int toconv;
1925 int output = 0;
1926
1927 if (handler == NULL) return(-1);
1928 if (out == NULL) return(-1);
1929
1930retry:
1931
1932 written = out->size - out->use;
1933
1934 /*
1935 * First specific handling of in = NULL, i.e. the initialization call
1936 */
1937 if (in == NULL) {
1938 toconv = 0;
1939 if (handler->output != NULL) {
1940 ret = handler->output(&out->content[out->use], &written,
1941 NULL, &toconv);
1942 out->use += written;
1943 out->content[out->use] = 0;
1944 }
1945#ifdef LIBXML_ICONV_ENABLED
1946 else if (handler->iconv_out != NULL) {
1947 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1948 &written, NULL, &toconv);
1949 out->use += written;
1950 out->content[out->use] = 0;
1951 }
1952#endif /* LIBXML_ICONV_ENABLED */
1953#ifdef DEBUG_ENCODING
1954 xmlGenericError(xmlGenericErrorContext,
1955 "initialized encoder\n");
1956#endif
1957 return(0);
1958 }
1959
1960 /*
1961 * Convertion itself.
1962 */
1963 toconv = in->use;
1964 if (toconv == 0)
1965 return(0);
1966 if (toconv * 2 >= written) {
1967 xmlBufferGrow(out, toconv * 2);
1968 written = out->size - out->use - 1;
1969 }
1970 if (handler->output != NULL) {
1971 ret = handler->output(&out->content[out->use], &written,
1972 in->content, &toconv);
1973 xmlBufferShrink(in, toconv);
1974 out->use += written;
1975 writtentot += written;
1976 out->content[out->use] = 0;
1977 }
1978#ifdef LIBXML_ICONV_ENABLED
1979 else if (handler->iconv_out != NULL) {
1980 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1981 &written, in->content, &toconv);
1982 xmlBufferShrink(in, toconv);
1983 out->use += written;
1984 writtentot += written;
1985 out->content[out->use] = 0;
1986 if (ret == -1) {
1987 if (written > 0) {
1988 /*
1989 * Can be a limitation of iconv
1990 */
1991 goto retry;
1992 }
1993 ret = -3;
1994 }
1995 }
1996#endif /* LIBXML_ICONV_ENABLED */
1997 else {
1998 xmlGenericError(xmlGenericErrorContext,
1999 "xmlCharEncOutFunc: no output function !\n");
2000 return(-1);
2001 }
2002
2003 if (ret >= 0) output += ret;
2004
2005 /*
2006 * Attempt to handle error cases
2007 */
2008 switch (ret) {
2009#ifdef DEBUG_ENCODING
2010 case 0:
2011 xmlGenericError(xmlGenericErrorContext,
2012 "converted %d bytes to %d bytes of output\n",
2013 toconv, written);
2014 break;
2015 case -1:
2016 xmlGenericError(xmlGenericErrorContext,
2017 "output conversion failed by lack of space\n");
2018 break;
2019#endif
2020 case -3:
2021 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2022 toconv, written, in->use);
2023 break;
2024 case -2: {
2025 int len = in->use;
2026 const xmlChar *utf = (const xmlChar *) in->content;
2027 int cur;
2028
2029 cur = xmlGetUTF8Char(utf, &len);
2030 if (cur > 0) {
2031 xmlChar charref[20];
2032
2033#ifdef DEBUG_ENCODING
2034 xmlGenericError(xmlGenericErrorContext,
2035 "handling output conversion error\n");
2036 xmlGenericError(xmlGenericErrorContext,
2037 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2038 in->content[0], in->content[1],
2039 in->content[2], in->content[3]);
2040#endif
2041 /*
2042 * Removes the UTF8 sequence, and replace it by a charref
2043 * and continue the transcoding phase, hoping the error
2044 * did not mangle the encoder state.
2045 */
2046 sprintf((char *) charref, "&#x%X;", cur);
2047 xmlBufferShrink(in, len);
2048 xmlBufferAddHead(in, charref, -1);
2049
2050 goto retry;
2051 } else {
2052 xmlGenericError(xmlGenericErrorContext,
2053 "output conversion failed due to conv error\n");
2054 xmlGenericError(xmlGenericErrorContext,
2055 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2056 in->content[0], in->content[1],
2057 in->content[2], in->content[3]);
2058 in->content[0] = ' ';
2059 }
2060 break;
2061 }
2062 }
2063 return(ret);
2064}
2065
2066/**
2067 * xmlCharEncCloseFunc:
2068 * @handler: char enconding transformation data structure
2069 *
2070 * Generic front-end for hencoding handler close function
2071 *
2072 * Returns 0 if success, or -1 in case of error
2073 */
2074int
2075xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2076 int ret = 0;
2077 if (handler == NULL) return(-1);
2078 if (handler->name == NULL) return(-1);
2079#ifdef LIBXML_ICONV_ENABLED
2080 /*
2081 * Iconv handlers can be oused only once, free the whole block.
2082 * and the associated icon resources.
2083 */
2084 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2085 if (handler->name != NULL)
2086 xmlFree(handler->name);
2087 handler->name = NULL;
2088 if (handler->iconv_out != NULL) {
2089 if (iconv_close(handler->iconv_out))
2090 ret = -1;
2091 handler->iconv_out = NULL;
2092 }
2093 if (handler->iconv_in != NULL) {
2094 if (iconv_close(handler->iconv_in))
2095 ret = -1;
2096 handler->iconv_in = NULL;
2097 }
2098 xmlFree(handler);
2099 }
2100#endif /* LIBXML_ICONV_ENABLED */
2101#ifdef DEBUG_ENCODING
2102 if (ret)
2103 xmlGenericError(xmlGenericErrorContext,
2104 "failed to close the encoding handler\n");
2105 else
2106 xmlGenericError(xmlGenericErrorContext,
2107 "closed the encoding handler\n");
2108
2109#endif
2110 return(ret);
2111}
2112