blob: db7b0cf036592719fabd0f9430aaa64524b20dc8 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
15 *
16 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
17 *
18 * See Copyright for the status of this software.
19 *
20 * Daniel.Veillard@w3.org
21 */
22
23#ifdef WIN32
24#include "win32config.h"
25#else
26#include "config.h"
27#endif
28
29#include <stdio.h>
30#include <string.h>
31
32#ifdef HAVE_CTYPE_H
33#include <ctype.h>
34#endif
35#ifdef HAVE_STDLIB_H
36#include <stdlib.h>
37#endif
38#include <libxml/xmlversion.h>
39#ifdef LIBXML_ICONV_ENABLED
40#ifdef HAVE_ERRNO_H
41#include <errno.h>
42#endif
43#endif
44#include <libxml/encoding.h>
45#include <libxml/xmlmemory.h>
46#ifdef LIBXML_HTML_ENABLED
47#include <libxml/HTMLparser.h>
48#endif
49#include <libxml/xmlerror.h>
50
51xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
52xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
53
54typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
55typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
56struct _xmlCharEncodingAlias {
57 const char *name;
58 const char *alias;
59};
60
61static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
62static int xmlCharEncodingAliasesNb = 0;
63static int xmlCharEncodingAliasesMax = 0;
64
65#ifdef LIBXML_ICONV_ENABLED
66#if 0
67#define DEBUG_ENCODING /* Define this to get encoding traces */
68#endif
69#endif
70
71static int xmlLittleEndian = 1;
72
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon ! (The routines
 * below nevertheless accept the four-octet form.)
 */
83
/**
 * xmlUTF8Strlen:
 * @utf:  a zero-terminated sequence of UTF-8 encoded bytes
 *
 * Count the characters of a UTF-8 string. Only the continuation bytes
 * following a multi-byte lead are verified, so this is not a complete
 * UTF-8 validation of the content.
 *
 * Returns the number of characters in the string or -1 in case of error
 */
int
xmlUTF8Strlen(const unsigned char *utf) {
    const unsigned char *cur;
    int count;

    if (utf == NULL)
        return(-1);

    for (cur = utf, count = 0; cur[0] != 0; count++) {
        /* plain 7-bit character */
        if ((cur[0] & 0x80) == 0) {
            cur += 1;
            continue;
        }

        /* every multi-byte form has at least one continuation byte */
        if ((cur[1] & 0xc0) != 0x80)
            return(-1);
        if ((cur[0] & 0xe0) != 0xe0) {
            cur += 2;
            continue;
        }

        if ((cur[2] & 0xc0) != 0x80)
            return(-1);
        if ((cur[0] & 0xf0) != 0xf0) {
            cur += 3;
            continue;
        }

        /* four bytes is the longest form accepted */
        if (((cur[0] & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80))
            return(-1);
        cur += 4;
    }
    return(count);
}
124
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  on input the number of bytes available at @utf, on output the
 *        number of bytes used by the character (0 in case of error)
 *
 * Read one UTF-8 character from @utf. Sequences of one to four bytes are
 * decoded; continuation bytes are verified but the lead byte is only
 * classified loosely, so this is not a strict validator.
 *
 * Returns the char value or -1 in case of error (NULL @utf or @len,
 * not enough bytes, or a bad continuation byte)
 */
static int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* a NULL @len is one of the ways to get here: never write through it */
    if (len != NULL)
        *len = 0;
    return(-1);
}
191
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * A byte of the form 10xxxxxx where a lead byte is expected is rejected,
 * and a NULL @utf is reported as invalid.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * The || chains below stop at the first failing byte, so a string
     * truncated in the middle of a sequence is never read past its
     * terminating zero.
     */
    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code: 1110xxxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code: 11110xxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* continuation byte in lead position, or a 5/6-byte lead */
            return(0);
        }
    }
    return(1);
}
235
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Copy a block of ASCII chars to a UTF-8 block of chars; every byte
 * below 0x80 is already its own UTF-8 encoding. Copying only proceeds
 * while more than five bytes of @out remain unused.
 *
 * Returns 0 if success, or -1 as soon as a byte >= 0x80 is met
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *dst = out;
    unsigned char *dstLimit = out + *outlen;
    const unsigned char *src = in;
    const unsigned char *srcLimit = in + *inlen;
    const unsigned char *done = in;
    int status = 0;

    while ((src < srcLimit) && ((dst - out) + 5 < *outlen)) {
        unsigned int ch = *src++;

        if (dst >= dstLimit)
            break;
        if (ch >= 0x80) {
            /* not ASCII: report what was converted so far */
            status = -1;
            break;
        }
        *dst++ = (unsigned char) ch;
        done = src;
    }
    *outlen = dst - out;
    *inlen = done - in;
    return(status);
}
286
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. A NULL @in is the initialization call and
 * produces nothing. A multi-byte sequence cut short by the end of @in
 * is left unconsumed so the caller can retry with more data.
 *
 * Returns 0 if success, or -2 if the transcoding fails (malformed
 * UTF-8 or a character outside of ASCII)
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) goto failed;   /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else goto failed;                 /* no chance for this in Ascii */

        /* incomplete sequence at the end of the block: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /*
             * A non-continuation byte here is malformed input. It must
             * fail the conversion rather than yield a half-decoded value.
             */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80)
            goto failed;                  /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

failed:
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
369
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied, the others become
 * a two-byte sequence. Conversion only proceeds while more than five
 * bytes of @out remain unused.
 *
 * Returns 0
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *dst = out;
    unsigned char *dstLimit = out + *outlen;
    const unsigned char *src = in;
    const unsigned char *srcLimit = in + *inlen;
    const unsigned char *done = in;

    while ((src < srcLimit) && ((dst - out) + 5 < *outlen)) {
        unsigned int ch = *src++;

        if (dst >= dstLimit)
            break;
        if (ch < 0x80) {
            *dst++ = (unsigned char) ch;
        } else {
            /* 110xxxxx lead followed by one 10xxxxxx continuation */
            *dst++ = (unsigned char) (((ch >> 6) & 0x1F) | 0xC0);
            if (dst < dstLimit)
                *dst++ = (unsigned char) ((ch & 0x3F) | 0x80);
        }
        done = src;
    }
    *outlen = dst - out;
    *inlen = done - in;
    return(0);
}
416
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. A NULL @in is the initialization call and produces
 * nothing. A sequence cut short by the end of @in is left unconsumed.
 *
 * Returns 0 if success, or -2 if the transcoding fails (malformed
 * UTF-8 or a character above 0xFF)
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *dst = out;
    const unsigned char *dstLimit;
    const unsigned char *src = in;
    const unsigned char *srcLimit;
    const unsigned char *done = in;
    int status = 0;

    if (in == NULL) {
        /* initialization: nothing to emit */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    srcLimit = in + *inlen;
    dstLimit = out + *outlen;

    while ((status == 0) && (src < srcLimit)) {
        unsigned int lead = *src++;
        unsigned int value;
        int extra;

        if (lead < 0x80) {
            value = lead;
            extra = 0;
        } else if (lead < 0xC0) {
            /* trailing byte in leading position */
            status = -2;
            break;
        } else if (lead < 0xE0) {
            value = lead & 0x1F;
            extra = 1;
        } else if (lead < 0xF0) {
            value = lead & 0x0F;
            extra = 2;
        } else if (lead < 0xF8) {
            value = lead & 0x07;
            extra = 3;
        } else {
            /* no chance for this in IsoLat1 */
            status = -2;
            break;
        }

        /* the rest of the sequence has not arrived yet */
        if (srcLimit - src < extra)
            break;

        while (extra > 0) {
            unsigned int next = *src++;

            if ((next & 0xC0) != 0x80) {
                status = -2;
                break;
            }
            value = (value << 6) | (next & 0x3F);
            extra--;
        }
        if (status != 0)
            break;

        if (value > 0xFF) {
            /* no chance for this in IsoLat1 */
            status = -2;
            break;
        }
        if (dst >= dstLimit)
            break;
        *dst++ = (unsigned char) value;
        done = src;
    }
    *outlen = dst - out;
    *inlen = done - in;
    return(status);
}
504
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is decoded byte by byte, so @inb
 * needs no particular alignment and the result does not depend on the
 * byte order of the host. An odd trailing byte and a surrogate pair
 * split by the end of the block are left unconsumed. Conversion only
 * proceeds while more than five bytes of @out remain unused.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate that is not followed by a low surrogate)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + (*inlenb / 2) * 2;
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        /* low byte first */
        c = in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)             /* second half not there yet */
                break;
            d = in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /*
         * assertion: c is a single UCS-4 value. The loop condition
         * guarantees at least six free bytes, enough for the longest
         * (four byte) sequence written below.
         */
        if      (c <    0x80) { *out++ =  c;                        bits = -6; }
        else if (c <   0x800) { *out++ = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
        else if (c < 0x10000) { *out++ = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
        else                  { *out++ = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
592
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. A NULL @in is the initialization call: the byte
 * order mark FF FE is written when @outb has room for it. Output is
 * stored byte by byte, so @outb needs no particular alignment and the
 * result does not depend on the byte order of the host. A sequence cut
 * short by the end of @in, or a character that no longer fits in
 * @outb, is left unconsumed.
 *
 * Returns 0 if success (2 when the byte order mark was written), or -2
 * if the transcoding failed (malformed UTF-8 or a value above 0x10FFFF)
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    const unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) goto failed;   /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else goto failed;                 /* no chance for this in UTF-16 */

        /* incomplete sequence at the end of the block: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto failed;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) c;
            *out++ = (unsigned char) (c >> 8);
        } else if (c < 0x110000) {
            /* surrogate pair, each half stored low byte first */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            d = 0xD800 | (c >> 10);
            *out++ = (unsigned char) d;
            *out++ = (unsigned char) (d >> 8);
            d = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) d;
            *out++ = (unsigned char) (d >> 8);
        } else {
            /* beyond the range UTF-16 can represent */
            goto failed;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

failed:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
711
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is decoded byte by byte, so @inb
 * needs no particular alignment and the result does not depend on the
 * byte order of the host. An odd trailing byte, a surrogate pair split
 * by the end of the block, and a character that no longer fits in @out
 * are left unconsumed; a character is only ever written in full.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate that is not followed by a low surrogate)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* outend = out + *outlen;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d, lead;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + (*inlenb / 2) * 2;
    while (in < inend) {
        /* high byte first */
        c = ((unsigned int) in[0] << 8) | in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)             /* second half not there yet */
                break;
            d = ((unsigned int) in[0] << 8) | in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) { lead =  c;                        bits = -6; }
        else if (c <   0x800) { lead = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
        else if (c < 0x10000) { lead = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
        else                  { lead = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        /*
         * One lead byte plus one continuation byte per remaining 6-bit
         * group. Stop before writing anything if the whole sequence
         * does not fit, so the output never ends in a truncated char.
         */
        if (outend - out < (bits / 6) + 2)
            break;

        *out++ = (unsigned char) lead;
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
803
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. A NULL @in is the initialization call: the byte
 * order mark FE FF is written when @outb has room for it. Output is
 * stored byte by byte, so @outb needs no particular alignment and the
 * result does not depend on the byte order of the host. A sequence cut
 * short by the end of @in, or a character that no longer fits in
 * @outb, is left unconsumed.
 *
 * Returns 0 if success (2 when the byte order mark was written), or -2
 * if the transcoding failed (malformed UTF-8 or a value above 0x10FFFF)
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    const unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) goto failed;   /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else goto failed;                 /* no chance for this in UTF-16 */

        /* incomplete sequence at the end of the block: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto failed;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) c;
        } else if (c < 0x110000) {
            /* surrogate pair, each half stored high byte first */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            d = 0xD800 | (c >> 10);
            *out++ = (unsigned char) (d >> 8);
            *out++ = (unsigned char) d;
            d = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (d >> 8);
            *out++ = (unsigned char) d;
        } else {
            /* beyond the range UTF-16 can represent */
            goto failed;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

failed:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
919
920/**
921 * xmlDetectCharEncoding:
922 * @in: a pointer to the first bytes of the XML entity, must be at least
923 * 4 bytes long.
924 * @len: pointer to the length of the buffer
925 *
926 * Guess the encoding of the entity using the first bytes of the entity content
927 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
928 *
929 * Returns one of the XML_CHAR_ENCODING_... values.
930 */
931xmlCharEncoding
932xmlDetectCharEncoding(const unsigned char* in, int len)
933{
934 if (len >= 4) {
935 if ((in[0] == 0x00) && (in[1] == 0x00) &&
936 (in[2] == 0x00) && (in[3] == 0x3C))
937 return(XML_CHAR_ENCODING_UCS4BE);
938 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
939 (in[2] == 0x00) && (in[3] == 0x00))
940 return(XML_CHAR_ENCODING_UCS4LE);
941 if ((in[0] == 0x00) && (in[1] == 0x00) &&
942 (in[2] == 0x3C) && (in[3] == 0x00))
943 return(XML_CHAR_ENCODING_UCS4_2143);
944 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
945 (in[2] == 0x00) && (in[3] == 0x00))
946 return(XML_CHAR_ENCODING_UCS4_3412);
947 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
948 (in[2] == 0xA7) && (in[3] == 0x94))
949 return(XML_CHAR_ENCODING_EBCDIC);
950 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
951 (in[2] == 0x78) && (in[3] == 0x6D))
952 return(XML_CHAR_ENCODING_UTF8);
953 }
954 if (len >= 2) {
955 if ((in[0] == 0xFE) && (in[1] == 0xFF))
956 return(XML_CHAR_ENCODING_UTF16BE);
957 if ((in[0] == 0xFF) && (in[1] == 0xFE))
958 return(XML_CHAR_ENCODING_UTF16LE);
959 }
960 return(XML_CHAR_ENCODING_NONE);
961}
962
963/**
964 * xmlCleanupEncodingAliases:
965 *
966 * Unregisters all aliases
967 */
968void
969xmlCleanupEncodingAliases(void) {
970 int i;
971
972 if (xmlCharEncodingAliases == NULL)
973 return;
974
975 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
976 if (xmlCharEncodingAliases[i].name != NULL)
977 xmlFree((char *) xmlCharEncodingAliases[i].name);
978 if (xmlCharEncodingAliases[i].alias != NULL)
979 xmlFree((char *) xmlCharEncodingAliases[i].alias);
980 }
981 xmlCharEncodingAliasesNb = 0;
982 xmlCharEncodingAliasesMax = 0;
983 xmlFree(xmlCharEncodingAliases);
984}
985
986/**
987 * xmlGetEncodingAlias:
988 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
989 *
990 * Lookup an encoding name for the given alias.
991 *
992 * Returns NULL if not found the original name otherwise
993 */
994const char *
995xmlGetEncodingAlias(const char *alias) {
996 int i;
997 char upper[100];
998
999 if (alias == NULL)
1000 return(NULL);
1001
1002 if (xmlCharEncodingAliases == NULL)
1003 return(NULL);
1004
1005 for (i = 0;i < 99;i++) {
1006 upper[i] = toupper(alias[i]);
1007 if (upper[i] == 0) break;
1008 }
1009 upper[i] = 0;
1010
1011 /*
1012 * Walk down the list looking for a definition of the alias
1013 */
1014 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1015 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1016 return(xmlCharEncodingAliases[i].name);
1017 }
1018 }
1019 return(NULL);
1020}
1021
1022/**
1023 * xmlAddEncodingAlias:
1024 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1025 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1026 *
1027 * Registers and alias @alias for an encoding named @name. Existing alias
1028 * will be overwritten.
1029 *
1030 * Returns 0 in case of success, -1 in case of error
1031 */
1032int
1033xmlAddEncodingAlias(const char *name, const char *alias) {
1034 int i;
1035 char upper[100];
1036
1037 if ((name == NULL) || (alias == NULL))
1038 return(-1);
1039
1040 for (i = 0;i < 99;i++) {
1041 upper[i] = toupper(alias[i]);
1042 if (upper[i] == 0) break;
1043 }
1044 upper[i] = 0;
1045
1046 if (xmlCharEncodingAliases == NULL) {
1047 xmlCharEncodingAliasesNb = 0;
1048 xmlCharEncodingAliasesMax = 20;
1049 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1050 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1051 if (xmlCharEncodingAliases == NULL)
1052 return(-1);
1053 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1054 xmlCharEncodingAliasesMax *= 2;
1055 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1056 xmlRealloc(xmlCharEncodingAliases,
1057 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1058 }
1059 /*
1060 * Walk down the list looking for a definition of the alias
1061 */
1062 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1063 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1064 /*
1065 * Replace the definition.
1066 */
1067 xmlFree((char *) xmlCharEncodingAliases[i].name);
1068 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1069 return(0);
1070 }
1071 }
1072 /*
1073 * Add the definition
1074 */
1075 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1076 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1077 xmlCharEncodingAliasesNb++;
1078 return(0);
1079}
1080
1081/**
1082 * xmlDelEncodingAlias:
1083 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1084 *
1085 * Unregisters an encoding alias @alias
1086 *
1087 * Returns 0 in case of success, -1 in case of error
1088 */
1089int
1090xmlDelEncodingAlias(const char *alias) {
1091 int i;
1092
1093 if (alias == NULL)
1094 return(-1);
1095
1096 if (xmlCharEncodingAliases == NULL)
1097 return(-1);
1098 /*
1099 * Walk down the list looking for a definition of the alias
1100 */
1101 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1102 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1103 xmlFree((char *) xmlCharEncodingAliases[i].name);
1104 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1105 xmlCharEncodingAliasesNb--;
1106 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1107 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1108 return(0);
1109 }
1110 }
1111 return(-1);
1112}
1113
1114/**
1115 * xmlParseCharEncoding:
1116 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1117 *
1118 * Conpare the string to the known encoding schemes already known. Note
1119 * that the comparison is case insensitive accordingly to the section
1120 * [XML] 4.3.3 Character Encoding in Entities.
1121 *
1122 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1123 * if not recognized.
1124 */
1125xmlCharEncoding
1126xmlParseCharEncoding(const char* name)
1127{
1128 const char *alias;
1129 char upper[500];
1130 int i;
1131
1132 if (name == NULL)
1133 return(XML_CHAR_ENCODING_NONE);
1134
1135 /*
1136 * Do the alias resolution
1137 */
1138 alias = xmlGetEncodingAlias(name);
1139 if (alias != NULL)
1140 name = alias;
1141
1142 for (i = 0;i < 499;i++) {
1143 upper[i] = toupper(name[i]);
1144 if (upper[i] == 0) break;
1145 }
1146 upper[i] = 0;
1147
1148 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1149 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1150 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1151
1152 /*
1153 * NOTE: if we were able to parse this, the endianness of UTF16 is
1154 * already found and in use
1155 */
1156 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1157 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1158
1159 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1160 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1161 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1162
1163 /*
1164 * NOTE: if we were able to parse this, the endianness of UCS4 is
1165 * already found and in use
1166 */
1167 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1168 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1169 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1170
1171
1172 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1173 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1174 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1175
1176 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1177 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1178 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1179
1180 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1181 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1182 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1183 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1184 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1185 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1186 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1187
1188 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1189 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1190 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1191
1192#ifdef DEBUG_ENCODING
1193 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1194#endif
1195 return(XML_CHAR_ENCODING_ERROR);
1196}
1197
1198/**
1199 * xmlGetCharEncodingName:
1200 * @enc: the encoding
1201 *
1202 * The "canonical" name for XML encoding.
1203 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1204 * Section 4.3.3 Character Encoding in Entities
1205 *
1206 * Returns the canonical name for the given encoding
1207 */
1208
1209const char*
1210xmlGetCharEncodingName(xmlCharEncoding enc) {
1211 switch (enc) {
1212 case XML_CHAR_ENCODING_ERROR:
1213 return(NULL);
1214 case XML_CHAR_ENCODING_NONE:
1215 return(NULL);
1216 case XML_CHAR_ENCODING_UTF8:
1217 return("UTF-8");
1218 case XML_CHAR_ENCODING_UTF16LE:
1219 return("UTF-16");
1220 case XML_CHAR_ENCODING_UTF16BE:
1221 return("UTF-16");
1222 case XML_CHAR_ENCODING_EBCDIC:
1223 return("EBCDIC");
1224 case XML_CHAR_ENCODING_UCS4LE:
1225 return("ISO-10646-UCS-4");
1226 case XML_CHAR_ENCODING_UCS4BE:
1227 return("ISO-10646-UCS-4");
1228 case XML_CHAR_ENCODING_UCS4_2143:
1229 return("ISO-10646-UCS-4");
1230 case XML_CHAR_ENCODING_UCS4_3412:
1231 return("ISO-10646-UCS-4");
1232 case XML_CHAR_ENCODING_UCS2:
1233 return("ISO-10646-UCS-2");
1234 case XML_CHAR_ENCODING_8859_1:
1235 return("ISO-8859-1");
1236 case XML_CHAR_ENCODING_8859_2:
1237 return("ISO-8859-2");
1238 case XML_CHAR_ENCODING_8859_3:
1239 return("ISO-8859-3");
1240 case XML_CHAR_ENCODING_8859_4:
1241 return("ISO-8859-4");
1242 case XML_CHAR_ENCODING_8859_5:
1243 return("ISO-8859-5");
1244 case XML_CHAR_ENCODING_8859_6:
1245 return("ISO-8859-6");
1246 case XML_CHAR_ENCODING_8859_7:
1247 return("ISO-8859-7");
1248 case XML_CHAR_ENCODING_8859_8:
1249 return("ISO-8859-8");
1250 case XML_CHAR_ENCODING_8859_9:
1251 return("ISO-8859-9");
1252 case XML_CHAR_ENCODING_2022_JP:
1253 return("ISO-2022-JP");
1254 case XML_CHAR_ENCODING_SHIFT_JIS:
1255 return("Shift-JIS");
1256 case XML_CHAR_ENCODING_EUC_JP:
1257 return("EUC-JP");
1258 case XML_CHAR_ENCODING_ASCII:
1259 return(NULL);
1260 }
1261 return(NULL);
1262}
1263
1264/****************************************************************
1265 * *
1266 * Char encoding handlers *
1267 * *
1268 ****************************************************************/
1269
/*
 * the size should be growable, but it's not a big deal ...
 * MAX_ENCODING_HANDLERS is the fixed number of slots allocated for the
 * handlers table by xmlInitCharEncodingHandlers(); registration beyond
 * that count is refused by xmlRegisterCharEncodingHandler().
 */
#define MAX_ENCODING_HANDLERS 50
/* table of the registered handlers, NULL until the module is initialized */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of entries of the table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * This is the value xmlFindCharEncodingHandler() returns when it is
 * given a NULL or empty name.
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1281
1282/**
1283 * xmlNewCharEncodingHandler:
1284 * @name: the encoding name, in UTF-8 format (ASCII actually)
1285 * @input: the xmlCharEncodingInputFunc to read that encoding
1286 * @output: the xmlCharEncodingOutputFunc to write that encoding
1287 *
1288 * Create and registers an xmlCharEncodingHandler.
1289 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1290 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001291static xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001292xmlNewCharEncodingHandler(const char *name,
1293 xmlCharEncodingInputFunc input,
1294 xmlCharEncodingOutputFunc output) {
1295 xmlCharEncodingHandlerPtr handler;
1296 const char *alias;
1297 char upper[500];
1298 int i;
1299 char *up = 0;
1300
1301 /*
1302 * Do the alias resolution
1303 */
1304 alias = xmlGetEncodingAlias(name);
1305 if (alias != NULL)
1306 name = alias;
1307
1308 /*
1309 * Keep only the uppercase version of the encoding.
1310 */
1311 if (name == NULL) {
1312 xmlGenericError(xmlGenericErrorContext,
1313 "xmlNewCharEncodingHandler : no name !\n");
1314 return(NULL);
1315 }
1316 for (i = 0;i < 499;i++) {
1317 upper[i] = toupper(name[i]);
1318 if (upper[i] == 0) break;
1319 }
1320 upper[i] = 0;
1321 up = xmlMemStrdup(upper);
1322 if (up == NULL) {
1323 xmlGenericError(xmlGenericErrorContext,
1324 "xmlNewCharEncodingHandler : out of memory !\n");
1325 return(NULL);
1326 }
1327
1328 /*
1329 * allocate and fill-up an handler block.
1330 */
1331 handler = (xmlCharEncodingHandlerPtr)
1332 xmlMalloc(sizeof(xmlCharEncodingHandler));
1333 if (handler == NULL) {
1334 xmlGenericError(xmlGenericErrorContext,
1335 "xmlNewCharEncodingHandler : out of memory !\n");
1336 return(NULL);
1337 }
1338 handler->input = input;
1339 handler->output = output;
1340 handler->name = up;
1341
1342#ifdef LIBXML_ICONV_ENABLED
1343 handler->iconv_in = NULL;
1344 handler->iconv_out = NULL;
1345#endif /* LIBXML_ICONV_ENABLED */
1346
1347 /*
1348 * registers and returns the handler.
1349 */
1350 xmlRegisterCharEncodingHandler(handler);
1351#ifdef DEBUG_ENCODING
1352 xmlGenericError(xmlGenericErrorContext,
1353 "Registered encoding handler for %s\n", name);
1354#endif
1355 return(handler);
1356}
1357
1358/**
1359 * xmlInitCharEncodingHandlers:
1360 *
1361 * Initialize the char encoding support, it registers the default
1362 * encoding supported.
1363 * NOTE: while public, this function usually doesn't need to be called
1364 * in normal processing.
1365 */
1366void
1367xmlInitCharEncodingHandlers(void) {
1368 unsigned short int tst = 0x1234;
1369 unsigned char *ptr = (unsigned char *) &tst;
1370
1371 if (handlers != NULL) return;
1372
1373 handlers = (xmlCharEncodingHandlerPtr *)
1374 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1375
1376 if (*ptr == 0x12) xmlLittleEndian = 0;
1377 else if (*ptr == 0x34) xmlLittleEndian = 1;
1378 else xmlGenericError(xmlGenericErrorContext,
1379 "Odd problem at endianness detection\n");
1380
1381 if (handlers == NULL) {
1382 xmlGenericError(xmlGenericErrorContext,
1383 "xmlInitCharEncodingHandlers : out of memory !\n");
1384 return;
1385 }
1386 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1387 xmlUTF16LEHandler =
1388 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1389 xmlUTF16BEHandler =
1390 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1391 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1392 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1393#ifdef LIBXML_HTML_ENABLED
1394 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1395#endif
1396}
1397
1398/**
1399 * xmlCleanupCharEncodingHandlers:
1400 *
1401 * Cleanup the memory allocated for the char encoding support, it
1402 * unregisters all the encoding handlers and the aliases.
1403 */
1404void
1405xmlCleanupCharEncodingHandlers(void) {
1406 xmlCleanupEncodingAliases();
1407
1408 if (handlers == NULL) return;
1409
1410 for (;nbCharEncodingHandler > 0;) {
1411 nbCharEncodingHandler--;
1412 if (handlers[nbCharEncodingHandler] != NULL) {
1413 if (handlers[nbCharEncodingHandler]->name != NULL)
1414 xmlFree(handlers[nbCharEncodingHandler]->name);
1415 xmlFree(handlers[nbCharEncodingHandler]);
1416 }
1417 }
1418 xmlFree(handlers);
1419 handlers = NULL;
1420 nbCharEncodingHandler = 0;
1421 xmlDefaultCharEncodingHandler = NULL;
1422}
1423
1424/**
1425 * xmlRegisterCharEncodingHandler:
1426 * @handler: the xmlCharEncodingHandlerPtr handler block
1427 *
1428 * Register the char encoding handler, surprizing, isn't it ?
1429 */
1430void
1431xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1432 if (handlers == NULL) xmlInitCharEncodingHandlers();
1433 if (handler == NULL) {
1434 xmlGenericError(xmlGenericErrorContext,
1435 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1436 return;
1437 }
1438
1439 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1440 xmlGenericError(xmlGenericErrorContext,
1441 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1442 xmlGenericError(xmlGenericErrorContext,
1443 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1444 return;
1445 }
1446 handlers[nbCharEncodingHandler++] = handler;
1447}
1448
1449/**
1450 * xmlGetCharEncodingHandler:
1451 * @enc: an xmlCharEncoding value.
1452 *
1453 * Search in the registrered set the handler able to read/write that encoding.
1454 *
1455 * Returns the handler or NULL if not found
1456 */
1457xmlCharEncodingHandlerPtr
1458xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1459 xmlCharEncodingHandlerPtr handler;
1460
1461 if (handlers == NULL) xmlInitCharEncodingHandlers();
1462 switch (enc) {
1463 case XML_CHAR_ENCODING_ERROR:
1464 return(NULL);
1465 case XML_CHAR_ENCODING_NONE:
1466 return(NULL);
1467 case XML_CHAR_ENCODING_UTF8:
1468 return(NULL);
1469 case XML_CHAR_ENCODING_UTF16LE:
1470 return(xmlUTF16LEHandler);
1471 case XML_CHAR_ENCODING_UTF16BE:
1472 return(xmlUTF16BEHandler);
1473 case XML_CHAR_ENCODING_EBCDIC:
1474 handler = xmlFindCharEncodingHandler("EBCDIC");
1475 if (handler != NULL) return(handler);
1476 handler = xmlFindCharEncodingHandler("ebcdic");
1477 if (handler != NULL) return(handler);
1478 break;
1479 case XML_CHAR_ENCODING_UCS4BE:
1480 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1481 if (handler != NULL) return(handler);
1482 handler = xmlFindCharEncodingHandler("UCS-4");
1483 if (handler != NULL) return(handler);
1484 handler = xmlFindCharEncodingHandler("UCS4");
1485 if (handler != NULL) return(handler);
1486 break;
1487 case XML_CHAR_ENCODING_UCS4LE:
1488 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1489 if (handler != NULL) return(handler);
1490 handler = xmlFindCharEncodingHandler("UCS-4");
1491 if (handler != NULL) return(handler);
1492 handler = xmlFindCharEncodingHandler("UCS4");
1493 if (handler != NULL) return(handler);
1494 break;
1495 case XML_CHAR_ENCODING_UCS4_2143:
1496 break;
1497 case XML_CHAR_ENCODING_UCS4_3412:
1498 break;
1499 case XML_CHAR_ENCODING_UCS2:
1500 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1501 if (handler != NULL) return(handler);
1502 handler = xmlFindCharEncodingHandler("UCS-2");
1503 if (handler != NULL) return(handler);
1504 handler = xmlFindCharEncodingHandler("UCS2");
1505 if (handler != NULL) return(handler);
1506 break;
1507
1508 /*
1509 * We used to keep ISO Latin encodings native in the
1510 * generated data. This led to so many problems that
1511 * this has been removed. One can still change this
1512 * back by registering no-ops encoders for those
1513 */
1514 case XML_CHAR_ENCODING_8859_1:
1515 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1516 if (handler != NULL) return(handler);
1517 break;
1518 case XML_CHAR_ENCODING_8859_2:
1519 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1520 if (handler != NULL) return(handler);
1521 break;
1522 case XML_CHAR_ENCODING_8859_3:
1523 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1524 if (handler != NULL) return(handler);
1525 break;
1526 case XML_CHAR_ENCODING_8859_4:
1527 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1528 if (handler != NULL) return(handler);
1529 break;
1530 case XML_CHAR_ENCODING_8859_5:
1531 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1532 if (handler != NULL) return(handler);
1533 break;
1534 case XML_CHAR_ENCODING_8859_6:
1535 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1536 if (handler != NULL) return(handler);
1537 break;
1538 case XML_CHAR_ENCODING_8859_7:
1539 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1540 if (handler != NULL) return(handler);
1541 break;
1542 case XML_CHAR_ENCODING_8859_8:
1543 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1544 if (handler != NULL) return(handler);
1545 break;
1546 case XML_CHAR_ENCODING_8859_9:
1547 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1548 if (handler != NULL) return(handler);
1549 break;
1550
1551
1552 case XML_CHAR_ENCODING_2022_JP:
1553 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1554 if (handler != NULL) return(handler);
1555 break;
1556 case XML_CHAR_ENCODING_SHIFT_JIS:
1557 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1558 if (handler != NULL) return(handler);
1559 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1560 if (handler != NULL) return(handler);
1561 handler = xmlFindCharEncodingHandler("Shift_JIS");
1562 if (handler != NULL) return(handler);
1563 break;
1564 case XML_CHAR_ENCODING_EUC_JP:
1565 handler = xmlFindCharEncodingHandler("EUC-JP");
1566 if (handler != NULL) return(handler);
1567 break;
1568 default:
1569 break;
1570 }
1571
1572#ifdef DEBUG_ENCODING
1573 xmlGenericError(xmlGenericErrorContext,
1574 "No handler found for encoding %d\n", enc);
1575#endif
1576 return(NULL);
1577}
1578
1579/**
1580 * xmlGetCharEncodingHandler:
1581 * @enc: a string describing the char encoding.
1582 *
1583 * Search in the registrered set the handler able to read/write that encoding.
1584 *
1585 * Returns the handler or NULL if not found
1586 */
1587xmlCharEncodingHandlerPtr
1588xmlFindCharEncodingHandler(const char *name) {
1589 const char *nalias;
1590 const char *norig;
1591 xmlCharEncoding alias;
1592#ifdef LIBXML_ICONV_ENABLED
1593 xmlCharEncodingHandlerPtr enc;
1594 iconv_t icv_in, icv_out;
1595#endif /* LIBXML_ICONV_ENABLED */
1596 char upper[100];
1597 int i;
1598
1599 if (handlers == NULL) xmlInitCharEncodingHandlers();
1600 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1601 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1602
1603 /*
1604 * Do the alias resolution
1605 */
1606 norig = name;
1607 nalias = xmlGetEncodingAlias(name);
1608 if (nalias != NULL)
1609 name = nalias;
1610
1611 /*
1612 * Check first for directly registered encoding names
1613 */
1614 for (i = 0;i < 99;i++) {
1615 upper[i] = toupper(name[i]);
1616 if (upper[i] == 0) break;
1617 }
1618 upper[i] = 0;
1619
1620 for (i = 0;i < nbCharEncodingHandler; i++)
1621 if (!strcmp(upper, handlers[i]->name)) {
1622#ifdef DEBUG_ENCODING
1623 xmlGenericError(xmlGenericErrorContext,
1624 "Found registered handler for encoding %s\n", name);
1625#endif
1626 return(handlers[i]);
1627 }
1628
1629#ifdef LIBXML_ICONV_ENABLED
1630 /* check whether iconv can handle this */
1631 icv_in = iconv_open("UTF-8", name);
1632 icv_out = iconv_open(name, "UTF-8");
1633 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1634 enc = (xmlCharEncodingHandlerPtr)
1635 xmlMalloc(sizeof(xmlCharEncodingHandler));
1636 if (enc == NULL) {
1637 iconv_close(icv_in);
1638 iconv_close(icv_out);
1639 return(NULL);
1640 }
1641 enc->name = xmlMemStrdup(name);
1642 enc->input = NULL;
1643 enc->output = NULL;
1644 enc->iconv_in = icv_in;
1645 enc->iconv_out = icv_out;
1646#ifdef DEBUG_ENCODING
1647 xmlGenericError(xmlGenericErrorContext,
1648 "Found iconv handler for encoding %s\n", name);
1649#endif
1650 return enc;
1651 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1652 xmlGenericError(xmlGenericErrorContext,
1653 "iconv : problems with filters for '%s'\n", name);
1654 }
1655#endif /* LIBXML_ICONV_ENABLED */
1656
1657#ifdef DEBUG_ENCODING
1658 xmlGenericError(xmlGenericErrorContext,
1659 "No handler found for encoding %s\n", name);
1660#endif
1661
1662 /*
1663 * Fallback using the canonical names
1664 */
1665 alias = xmlParseCharEncoding(norig);
1666 if (alias != XML_CHAR_ENCODING_ERROR) {
1667 const char* canon;
1668 canon = xmlGetCharEncodingName(alias);
1669 if ((canon != NULL) && (strcmp(name, canon))) {
1670 return(xmlFindCharEncodingHandler(canon));
1671 }
1672 }
1673
1674 return(NULL);
1675}
1676
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert, may be NULL
 * @inlen:  the length of @in
 *
 * Run one iconv() conversion and translate its outcome into the
 * return codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (errno is E2BIG), or
 *     -2 if the transcoding fails because of an invalid sequence
 *        (errno is EILSEQ), or
 *     -3 in any other failure case, including an incomplete sequence
 *        at the end of the input (errno is EINVAL).
 *
 * On return @inlen holds the number of bytes consumed from @in and
 * @outlen the number of bytes stored in @out; both are set to 0 when
 * @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t inLeft = *inlen;
    size_t outLeft = *outlen;
    const char *src = (const char *) in;
    char *dst = (char *) out;
    int rc;

    rc = iconv(cd, &src, &inLeft, &dst, &outLeft);

    /* iconv() leaves the remaining counts, turn them into used counts */
    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
    } else {
        *inlen -= inLeft;
        *outlen -= outLeft;
    }

    if ((inLeft == 0) && (rc != -1))
        return 0;

#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL and every other error are reported the same way */
    return -3;
}
#endif /* LIBXML_ICONV_ENABLED */
1737
1738/**
1739 * xmlCharEncFirstLine:
1740 * @handler: char enconding transformation data structure
1741 * @out: an xmlBuffer for the output.
1742 * @in: an xmlBuffer for the input
1743 *
1744 * Front-end for the encoding handler input function, but handle only
1745 * the very first line, i.e. limit itself to 45 chars.
1746 *
1747 * Returns the number of byte written if success, or
1748 * -1 general error
1749 * -2 if the transcoding fails (for *in is not valid utf8 string or
1750 * the result of transformation can't fit into the encoding we want), or
1751 */
1752int
1753xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1754 xmlBufferPtr in) {
1755 int ret = -2;
1756 int written;
1757 int toconv;
1758
1759 if (handler == NULL) return(-1);
1760 if (out == NULL) return(-1);
1761 if (in == NULL) return(-1);
1762
1763 written = out->size - out->use;
1764 toconv = in->use;
1765 if (toconv * 2 >= written) {
1766 xmlBufferGrow(out, toconv);
1767 written = out->size - out->use - 1;
1768 }
1769
1770 /*
1771 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1772 * 45 chars should be sufficient to reach the end of the encoding
1773 * decalration without going too far inside the document content.
1774 */
1775 written = 45;
1776
1777 if (handler->input != NULL) {
1778 ret = handler->input(&out->content[out->use], &written,
1779 in->content, &toconv);
1780 xmlBufferShrink(in, toconv);
1781 out->use += written;
1782 out->content[out->use] = 0;
1783 }
1784#ifdef LIBXML_ICONV_ENABLED
1785 else if (handler->iconv_in != NULL) {
1786 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1787 &written, in->content, &toconv);
1788 xmlBufferShrink(in, toconv);
1789 out->use += written;
1790 out->content[out->use] = 0;
1791 if (ret == -1) ret = -3;
1792 }
1793#endif /* LIBXML_ICONV_ENABLED */
1794#ifdef DEBUG_ENCODING
1795 switch (ret) {
1796 case 0:
1797 xmlGenericError(xmlGenericErrorContext,
1798 "converted %d bytes to %d bytes of input\n",
1799 toconv, written);
1800 break;
1801 case -1:
1802 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1803 toconv, written, in->use);
1804 break;
1805 case -2:
1806 xmlGenericError(xmlGenericErrorContext,
1807 "input conversion failed due to input error\n");
1808 break;
1809 case -3:
1810 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1811 toconv, written, in->use);
1812 break;
1813 default:
1814 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
1815 }
1816#endif
1817 /*
1818 * Ignore when input buffer is not on a boundary
1819 */
1820 if (ret == -3) ret = 0;
1821 if (ret == -1) ret = 0;
1822 return(ret);
1823}
1824
1825/**
1826 * xmlCharEncInFunc:
1827 * @handler: char enconding transformation data structure
1828 * @out: an xmlBuffer for the output.
1829 * @in: an xmlBuffer for the input
1830 *
1831 * Generic front-end for the encoding handler input function
1832 *
1833 * Returns the number of byte written if success, or
1834 * -1 general error
1835 * -2 if the transcoding fails (for *in is not valid utf8 string or
1836 * the result of transformation can't fit into the encoding we want), or
1837 */
1838int
1839xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1840 xmlBufferPtr in) {
1841 int ret = -2;
1842 int written;
1843 int toconv;
1844
1845 if (handler == NULL) return(-1);
1846 if (out == NULL) return(-1);
1847 if (in == NULL) return(-1);
1848
1849 toconv = in->use;
1850 if (toconv == 0)
1851 return(0);
1852 written = out->size - out->use;
1853 if (toconv * 2 >= written) {
1854 xmlBufferGrow(out, out->size + toconv * 2);
1855 written = out->size - out->use - 1;
1856 }
1857 if (handler->input != NULL) {
1858 ret = handler->input(&out->content[out->use], &written,
1859 in->content, &toconv);
1860 xmlBufferShrink(in, toconv);
1861 out->use += written;
1862 out->content[out->use] = 0;
1863 }
1864#ifdef LIBXML_ICONV_ENABLED
1865 else if (handler->iconv_in != NULL) {
1866 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1867 &written, in->content, &toconv);
1868 xmlBufferShrink(in, toconv);
1869 out->use += written;
1870 out->content[out->use] = 0;
1871 if (ret == -1) ret = -3;
1872 }
1873#endif /* LIBXML_ICONV_ENABLED */
1874 switch (ret) {
1875#ifdef DEBUG_ENCODING
1876 case 0:
1877 xmlGenericError(xmlGenericErrorContext,
1878 "converted %d bytes to %d bytes of input\n",
1879 toconv, written);
1880 break;
1881 case -1:
1882 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1883 toconv, written, in->use);
1884 break;
1885 case -3:
1886 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1887 toconv, written, in->use);
1888 break;
1889#endif
1890 case -2:
1891 xmlGenericError(xmlGenericErrorContext,
1892 "input conversion failed due to input error\n");
1893 xmlGenericError(xmlGenericErrorContext,
1894 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1895 in->content[0], in->content[1],
1896 in->content[2], in->content[3]);
1897 }
1898 /*
1899 * Ignore when input buffer is not on a boundary
1900 */
1901 if (ret == -3) ret = 0;
1902 return(ret);
1903}
1904
1905/**
1906 * xmlCharEncOutFunc:
1907 * @handler: char enconding transformation data structure
1908 * @out: an xmlBuffer for the output.
1909 * @in: an xmlBuffer for the input
1910 *
1911 * Generic front-end for the encoding handler output function
1912 * a first call with @in == NULL has to be made firs to initiate the
1913 * output in case of non-stateless encoding needing to initiate their
1914 * state or the output (like the BOM in UTF16).
1915 * In case of UTF8 sequence conversion errors for the given encoder,
1916 * the content will be automatically remapped to a CharRef sequence.
1917 *
1918 * Returns the number of byte written if success, or
1919 * -1 general error
1920 * -2 if the transcoding fails (for *in is not valid utf8 string or
1921 * the result of transformation can't fit into the encoding we want), or
1922 */
1923int
1924xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1925 xmlBufferPtr in) {
1926 int ret = -2;
1927 int written;
1928 int writtentot = 0;
1929 int toconv;
1930 int output = 0;
1931
1932 if (handler == NULL) return(-1);
1933 if (out == NULL) return(-1);
1934
1935retry:
1936
1937 written = out->size - out->use;
1938
1939 /*
1940 * First specific handling of in = NULL, i.e. the initialization call
1941 */
1942 if (in == NULL) {
1943 toconv = 0;
1944 if (handler->output != NULL) {
1945 ret = handler->output(&out->content[out->use], &written,
1946 NULL, &toconv);
1947 out->use += written;
1948 out->content[out->use] = 0;
1949 }
1950#ifdef LIBXML_ICONV_ENABLED
1951 else if (handler->iconv_out != NULL) {
1952 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1953 &written, NULL, &toconv);
1954 out->use += written;
1955 out->content[out->use] = 0;
1956 }
1957#endif /* LIBXML_ICONV_ENABLED */
1958#ifdef DEBUG_ENCODING
1959 xmlGenericError(xmlGenericErrorContext,
1960 "initialized encoder\n");
1961#endif
1962 return(0);
1963 }
1964
1965 /*
1966 * Convertion itself.
1967 */
1968 toconv = in->use;
1969 if (toconv == 0)
1970 return(0);
1971 if (toconv * 2 >= written) {
1972 xmlBufferGrow(out, toconv * 2);
1973 written = out->size - out->use - 1;
1974 }
1975 if (handler->output != NULL) {
1976 ret = handler->output(&out->content[out->use], &written,
1977 in->content, &toconv);
1978 xmlBufferShrink(in, toconv);
1979 out->use += written;
1980 writtentot += written;
1981 out->content[out->use] = 0;
1982 }
1983#ifdef LIBXML_ICONV_ENABLED
1984 else if (handler->iconv_out != NULL) {
1985 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1986 &written, in->content, &toconv);
1987 xmlBufferShrink(in, toconv);
1988 out->use += written;
1989 writtentot += written;
1990 out->content[out->use] = 0;
1991 if (ret == -1) {
1992 if (written > 0) {
1993 /*
1994 * Can be a limitation of iconv
1995 */
1996 goto retry;
1997 }
1998 ret = -3;
1999 }
2000 }
2001#endif /* LIBXML_ICONV_ENABLED */
2002 else {
2003 xmlGenericError(xmlGenericErrorContext,
2004 "xmlCharEncOutFunc: no output function !\n");
2005 return(-1);
2006 }
2007
2008 if (ret >= 0) output += ret;
2009
2010 /*
2011 * Attempt to handle error cases
2012 */
2013 switch (ret) {
2014#ifdef DEBUG_ENCODING
2015 case 0:
2016 xmlGenericError(xmlGenericErrorContext,
2017 "converted %d bytes to %d bytes of output\n",
2018 toconv, written);
2019 break;
2020 case -1:
2021 xmlGenericError(xmlGenericErrorContext,
2022 "output conversion failed by lack of space\n");
2023 break;
2024#endif
2025 case -3:
2026 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2027 toconv, written, in->use);
2028 break;
2029 case -2: {
2030 int len = in->use;
2031 const xmlChar *utf = (const xmlChar *) in->content;
2032 int cur;
2033
2034 cur = xmlGetUTF8Char(utf, &len);
2035 if (cur > 0) {
2036 xmlChar charref[20];
2037
2038#ifdef DEBUG_ENCODING
2039 xmlGenericError(xmlGenericErrorContext,
2040 "handling output conversion error\n");
2041 xmlGenericError(xmlGenericErrorContext,
2042 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2043 in->content[0], in->content[1],
2044 in->content[2], in->content[3]);
2045#endif
2046 /*
2047 * Removes the UTF8 sequence, and replace it by a charref
2048 * and continue the transcoding phase, hoping the error
2049 * did not mangle the encoder state.
2050 */
2051 sprintf((char *) charref, "&#x%X;", cur);
2052 xmlBufferShrink(in, len);
2053 xmlBufferAddHead(in, charref, -1);
2054
2055 goto retry;
2056 } else {
2057 xmlGenericError(xmlGenericErrorContext,
2058 "output conversion failed due to conv error\n");
2059 xmlGenericError(xmlGenericErrorContext,
2060 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2061 in->content[0], in->content[1],
2062 in->content[2], in->content[3]);
2063 in->content[0] = ' ';
2064 }
2065 break;
2066 }
2067 }
2068 return(ret);
2069}
2070
2071/**
2072 * xmlCharEncCloseFunc:
2073 * @handler: char enconding transformation data structure
2074 *
2075 * Generic front-end for hencoding handler close function
2076 *
2077 * Returns 0 if success, or -1 in case of error
2078 */
2079int
2080xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2081 int ret = 0;
2082 if (handler == NULL) return(-1);
2083 if (handler->name == NULL) return(-1);
2084#ifdef LIBXML_ICONV_ENABLED
2085 /*
2086 * Iconv handlers can be oused only once, free the whole block.
2087 * and the associated icon resources.
2088 */
2089 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2090 if (handler->name != NULL)
2091 xmlFree(handler->name);
2092 handler->name = NULL;
2093 if (handler->iconv_out != NULL) {
2094 if (iconv_close(handler->iconv_out))
2095 ret = -1;
2096 handler->iconv_out = NULL;
2097 }
2098 if (handler->iconv_in != NULL) {
2099 if (iconv_close(handler->iconv_in))
2100 ret = -1;
2101 handler->iconv_in = NULL;
2102 }
2103 xmlFree(handler);
2104 }
2105#endif /* LIBXML_ICONV_ENABLED */
2106#ifdef DEBUG_ENCODING
2107 if (ret)
2108 xmlGenericError(xmlGenericErrorContext,
2109 "failed to close the encoding handler\n");
2110 else
2111 xmlGenericError(xmlGenericErrorContext,
2112 "closed the encoding handler\n");
2113
2114#endif
2115 return(ret);
2116}
2117