blob: f86adf3eaf6aedf9db62b76e3615a8a3f72e2808 [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 character codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 *
 * UTF8 string routines from:
 * "William M. Brack" <wbrack@mmm.com.hk>
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 */
25
Bjorn Reese70a9da52001-04-21 16:57:29 +000026#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000027
Owen Taylor3473f882001-02-23 17:55:21 +000028#include <string.h>
29
30#ifdef HAVE_CTYPE_H
31#include <ctype.h>
32#endif
33#ifdef HAVE_STDLIB_H
34#include <stdlib.h>
35#endif
Owen Taylor3473f882001-02-23 17:55:21 +000036#ifdef LIBXML_ICONV_ENABLED
37#ifdef HAVE_ERRNO_H
38#include <errno.h>
39#endif
40#endif
41#include <libxml/encoding.h>
42#include <libxml/xmlmemory.h>
43#ifdef LIBXML_HTML_ENABLED
44#include <libxml/HTMLparser.h>
45#endif
46#include <libxml/xmlerror.h>
47
/*
 * Handlers for UTF-16LE and UTF-16BE (per their names); both start out NULL.
 * NOTE(review): they are presumably filled in by the handler initialisation
 * code, which is not visible in this part of the file.
 */
xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;

typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
/* One alias registration: @alias is another spelling of encoding @name. */
struct _xmlCharEncodingAlias {
    const char *name;   /* encoding name handed back by xmlGetEncodingAlias() */
    const char *alias;  /* alias string, matched against the upper-cased query */
};

/* Alias table: the array itself, entries in use, and allocated capacity. */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;
static int xmlCharEncodingAliasesMax = 0;

#ifdef LIBXML_ICONV_ENABLED
#if 0
#define DEBUG_ENCODING  /* Define this to get encoding traces */
#endif
#endif

/*
 * Non-zero when the host stores 16-bit values low byte first; consulted by
 * the UTF-16 converters.  Defaults to 1.
 * NOTE(review): presumably corrected at initialisation time - confirm.
 */
static int xmlLittleEndian = 1;
69
/************************************************************************
 *									*
 *		Generic UTF8 handling routines				*
 *									*
 * From rfc2044: encoding of the Unicode values on UTF-8:		*
 *									*
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)		*
 * 0000 0000-0000 007F   0xxxxxxx					*
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx				*
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx			*
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx		*
 *									*
 * Values > 0xFFFF are rare, but the routines below do accept the	*
 * 4-octet form.							*
 *									*
 ************************************************************************/
Owen Taylor3473f882001-02-23 17:55:21 +000084
85/**
Daniel Veillarde043ee12001-04-16 14:08:07 +000086 * xmlUTF8Strlen:
87 * @utf: a sequence of UTF-8 encoded bytes
88 *
89 * compute the lenght of an UTF8 string, it doesn't do a full UTF8
90 * checking of the content of the string.
91 *
92 * Returns the number of characters in the string or -1 in case of error
93 */
94int
Daniel Veillard97ac1312001-05-30 19:14:17 +000095xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillarde043ee12001-04-16 14:08:07 +000096 int ret = 0;
97
98 if (utf == NULL)
99 return(-1);
100
101 while (*utf != 0) {
102 if (utf[0] & 0x80) {
103 if ((utf[1] & 0xc0) != 0x80)
104 return(-1);
105 if ((utf[0] & 0xe0) == 0xe0) {
106 if ((utf[2] & 0xc0) != 0x80)
107 return(-1);
108 if ((utf[0] & 0xf0) == 0xf0) {
109 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
110 return(-1);
111 utf += 4;
112 } else {
113 utf += 3;
114 }
115 } else {
116 utf += 2;
117 }
118 } else {
119 utf++;
120 }
121 ret++;
122 }
123 return(ret);
124}
125
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  in: the number of bytes available at @utf,
 *        out: the number of bytes used by the character (0 on error)
 *
 * Read one UTF-8 character (1 to 4 bytes) from @utf.  The check is
 * structural only: continuation bytes must match 10xxxxxx and the lead
 * byte must not announce more than 4 bytes.
 *
 * Returns the char value or -1 in case of error (NULL argument, not
 * enough bytes, malformed sequence).  @len is updated unless it is NULL.
 */
static int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
192
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * A continuation byte (10xxxxxx) found where a lead byte is expected
 * is rejected, and a NULL @utf is reported as invalid.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * The continuation tests short-circuit, so a NUL terminator inside a
     * truncated sequence stops the reads before they leave the string.
     */
    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte, or a lead announcing > 4 bytes */
            return(0);
        }
    }
    return(1);
}
236
237/**
Daniel Veillard97ac1312001-05-30 19:14:17 +0000238 * xmlUTF8Strsize:
239 * @utf: a sequence of UTF-8 encoded bytes
240 * @len: the number of characters in the array
241 *
242 * storage size of an UTF8 string
243 *
244 * Returns the storage size of
245 * the first 'len' characters of ARRAY
246 *
247 */
248
249int
250xmlUTF8Strsize(const xmlChar *utf, int len) {
251 const xmlChar *ptr=utf;
252 xmlChar ch;
253
254 if (len <= 0)
255 return(0);
256
257 while ( len-- > 0) {
258 if ( !*ptr )
259 break;
260 if ( (ch = *ptr++) & 0x80)
261 while ( (ch<<=1) & 0x80 )
262 ptr++;
263 }
264 return (ptr - utf);
265}
266
267
268/**
269 * xmlUTF8Strndup:
270 * @utf: the input UTF8 *
271 * @len: the len of @utf (in chars)
272 *
273 * a strndup for array of UTF8's
274 *
275 * Returns a new UTF8 * or NULL
276 */
277xmlChar *
278xmlUTF8Strndup(const xmlChar *utf, int len) {
279 xmlChar *ret;
280 int i;
281
282 if ((utf == NULL) || (len < 0)) return(NULL);
283 i = xmlUTF8Strsize(utf, len);
284 ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
285 if (ret == NULL) {
286 xmlGenericError(xmlGenericErrorContext,
287 "malloc of %ld byte failed\n",
288 (len + 1) * (long)sizeof(xmlChar));
289 return(NULL);
290 }
291 memcpy(ret, utf, i * sizeof(xmlChar));
292 ret[i] = 0;
293 return(ret);
294}
295
296/**
297 * xmlUTF8Strpos:
298 * @utf: the input UTF8 *
299 * @pos: the position of the desired UTF8 char (in chars)
300 *
301 * a function to provide the equivalent of fetching a
302 * character from a string array
303 *
304 * Returns a pointer to the UTF8 character or NULL
305 */
306xmlChar *
307xmlUTF8Strpos(const xmlChar *utf, int pos) {
308 xmlChar ch;
309
310 if (utf == NULL) return(NULL);
311 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
312 return(NULL);
313 while (pos--) {
314 if ((ch=*utf++) == 0) return(NULL);
315 if ( ch & 0x80 ) {
316 /* if not simple ascii, verify proper format */
317 if ( (ch & 0xc0) != 0xc0 )
318 return(NULL);
319 /* then skip over remaining bytes for this char */
320 while ( (ch <<= 1) & 0x80 )
321 if ( (*utf++ & 0xc0) != 0x80 )
322 return(NULL);
323 }
324 }
325 return((xmlChar *)utf);
326}
327
328/**
329 * xmlUTF8Strloc:
330 * @utf: the input UTF8 *
331 * @utfchar: the UTF8 character to be found
332 *
333 * a function to provide relative location of a UTF8 char
334 *
335 * Returns the relative character position of the desired char
336 * or -1 if not found
337 */
338int
339xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
340 int i, size;
341 xmlChar ch;
342
343 if (utf==NULL || utfchar==NULL) return -1;
344 size = xmlUTF8Strsize(utfchar, 1);
345 for(i=0; (ch=*utf) != 0; i++) {
346 if (xmlStrncmp(utf, utfchar, size)==0)
347 return(i);
348 utf++;
349 if ( ch & 0x80 ) {
350 /* if not simple ascii, verify proper format */
351 if ( (ch & 0xc0) != 0xc0 )
352 return(-1);
353 /* then skip over remaining bytes for this char */
354 while ( (ch <<= 1) & 0x80 )
355 if ( (*utf++ & 0xc0) != 0x80 )
356 return(-1);
357 }
358 }
359
360 return(-1);
361}
362/**
363 * xmlUTF8Strsub:
364 * @utf: a sequence of UTF-8 encoded bytes
365 *
366 * @start: relative pos of first char
367 * @len: total number to copy
368 *
369 * Note: positions are given in units of UTF-8 chars
370 *
371 * Returns a pointer to a newly created string
372 * or NULL if any problem
373 */
374
375xmlChar *
376xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
377 int i;
378 xmlChar ch;
379
380 if (utf == NULL) return(NULL);
381 if (start < 0) return(NULL);
382 if (len < 0) return(NULL);
383
384 /*
385 * Skip over any leading chars
386 */
387 for (i = 0;i < start;i++) {
388 if ((ch=*utf++) == 0) return(NULL);
389 if ( ch & 0x80 ) {
390 /* if not simple ascii, verify proper format */
391 if ( (ch & 0xc0) != 0xc0 )
392 return(NULL);
393 /* then skip over remaining bytes for this char */
394 while ( (ch <<= 1) & 0x80 )
395 if ( (*utf++ & 0xc0) != 0x80 )
396 return(NULL);
397 }
398 }
399
400 return(xmlUTF8Strndup(utf, len));
401}
402
403/************************************************************************
404 * *
405 * Conversions To/From UTF8 encoding *
406 * *
407 ************************************************************************/
408
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out.  ASCII maps one to one onto UTF-8, so bytes are
 * copied until either buffer is exhausted; the whole of @out may be used.
 *
 * Returns 0 if success, or -1 if a byte >= 0x80 is met in @in
 * The value of @inlen after return is the number of octets consumed
 * (up to, not including, the offending byte in the error case).
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend = in + *inlen;
    int ret = 0;

    while ((in < inend) && (out < outend)) {
        unsigned int c = *in;

        if (c >= 0x80) {
            /* not ASCII: stop in front of the offending byte */
            ret = -1;
            break;
        }
        *out++ = (unsigned char) c;
        in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - base);
    return(ret);
}
459
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.  A NULL @in is an initialization call and produces
 * nothing.  Conversion stops quietly when @out is full or when @in ends
 * in the middle of a multi-byte sequence.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8,
 * including a bad continuation byte, or a character above 0x7F)
 * The value of @inlen after return is the number of octets consumed;
 * on failure it stops in front of the offending character.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failure;   /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failure;   /* no chance for this in Ascii */

        /* incomplete sequence at the end of the block: leave it unconsumed */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a non-continuation byte here means the input is malformed */
            if ((d & 0xC0) != 0x80)
                goto failure;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80)
            goto failure;                   /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(0);

failure:
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(-2);
}
542
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.  Bytes below 0x80 are copied, the others become a
 * 2-byte sequence.  Conversion stops when the next character no longer
 * fits in @out; a character is never written partially.
 *
 * Returns 0
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend = in + (*inlen);

    while (in < inend) {
        unsigned int c = *in;

        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = (unsigned char) c;
        } else {
            /* both bytes must fit, otherwise leave the char for later */
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            *out++ = (unsigned char) ((c & 0x3F) | 0x80);
        }
        in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - base);
    return(0);
}
590
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.  A NULL @in is an initialization call and produces
 * nothing.  Conversion stops quietly when @out is full or when @in ends
 * in the middle of a multi-byte sequence.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a character above 0xFF)
 * The value of @inlen after return is the number of octets consumed;
 * on failure it stops in front of the offending character.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* const first_in = in;
    const unsigned char* const first_out = out;
    const unsigned char* done = in;
    const unsigned char* in_limit;
    const unsigned char* out_limit;
    unsigned int value, byte;
    int extra;

    if (in == NULL) {
        /* initialization call: nothing to emit */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    in_limit = in + (*inlen);
    out_limit = out + (*outlen);

    for (;;) {
        if (in >= in_limit)
            break;

        /* classify the lead byte: payload bits and continuation count */
        byte = *in++;
        if (byte < 0x80) {
            value = byte;
            extra = 0;
        } else if (byte < 0xC0) {
            /* trailing byte in leading position */
            goto invalid;
        } else if (byte < 0xE0) {
            value = byte & 0x1F;
            extra = 1;
        } else if (byte < 0xF0) {
            value = byte & 0x0F;
            extra = 2;
        } else if (byte < 0xF8) {
            value = byte & 0x07;
            extra = 3;
        } else {
            /* no chance for this in IsoLat1 */
            goto invalid;
        }

        /* sequence cut by the end of the block: keep it for the next call */
        if (in_limit - in < extra)
            break;

        while (extra > 0) {
            byte = *in++;
            if ((byte & 0xC0) != 0x80)
                goto invalid;
            value = (value << 6) | (byte & 0x3F);
            extra--;
        }

        /* value now holds one complete code point */
        if (value > 0xFF)
            goto invalid;       /* no chance for this in IsoLat1 */
        if (out >= out_limit)
            break;
        *out++ = value;
        done = in;
    }

    *outlen = out - first_out;
    *inlen = done - first_in;
    return(0);

invalid:
    *outlen = out - first_out;
    *inlen = done - first_in;
    return(-2);
}
678
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an UTF-8
 * block of chars out.  The input is read byte by byte (low byte first),
 * so neither the host byte order nor the alignment of @inb matters.
 * A trailing odd byte is ignored.  Conversion stops quietly when the next
 * character does not fit in @out or when the low half of a surrogate pair
 * is not available yet; a character is never written partially.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 * not followed by a low surrogate)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d, lead;
    int nbytes, bits;

    nbytes = *inlenb;
    if ((nbytes % 2) == 1)
        nbytes--;
    inend = inb + nbytes;

    while (inend - in >= 2) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        next = in + 2;

        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - next < 2)
                break;                   /* low half not available yet */
            d = (unsigned int) next[0] | ((unsigned int) next[1] << 8);
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (in - inb);
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) { lead= c;                          bits= -6; }
        else if (c <   0x800) { lead= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
        else if (c < 0x10000) { lead= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
        else                  { lead= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }

        /* the sequence is 2 + bits/6 bytes long; write it all or not at all */
        if (outend - out < 2 + bits / 6)
            break;

        *out++ = (unsigned char) lead;
        for ( ; bits >= 0; bits-= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        in = next;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (in - inb);
    return(0);
}
766
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.  The output is written byte by byte (low byte
 * first), so neither the host byte order nor the alignment of @outb
 * matters.  A NULL @in is an initialization call: the Byte Order Mark
 * FF FE is emitted if @outb has room for it.  Conversion stops quietly
 * when @outb is full, when @in ends in the middle of a sequence, or on a
 * value of 0x110000 and above.
 *
 * Returns 0 if success (2 for an initialization call that wrote the
 * Byte Order Mark), or -2 if the transcoding failed (malformed UTF-8)
 * The value of *inlen after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failure;   /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failure;   /* no chance for this in UTF-16 */

        /* incomplete sequence at the end of the block: leave it unconsumed */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a non-continuation byte here means the input is malformed */
            if ((d & 0xC0) != 0x80)
                goto failure;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair: two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
        }
        else
            break;
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(0);

failure:
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(-2);
}
885
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an UTF-8
 * block of chars out.  The input is read byte by byte (high byte first),
 * so neither the host byte order nor the alignment of @inb matters.
 * A trailing odd byte is ignored.  Conversion stops quietly when the next
 * character does not fit in @out; a character is never written partially.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 * that is the last unit of the block or is not followed by a low
 * surrogate)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d, lead;
    int nbytes, bits;

    nbytes = *inlenb;
    if ((nbytes % 2) == 1)
        nbytes--;
    inend = inb + nbytes;

    while (inend - in >= 2) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        next = in + 2;

        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - next < 2)
                goto failure;            /* pair cut by the end of the block */
            d = ((unsigned int) next[0] << 8) | (unsigned int) next[1];
            next += 2;
            if ((d & 0xFC00) != 0xDC00)
                goto failure;
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) { lead= c;                          bits= -6; }
        else if (c <   0x800) { lead= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
        else if (c < 0x10000) { lead= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
        else                  { lead= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }

        /* the sequence is 2 + bits/6 bytes long; write it all or not at all */
        if (outend - out < 2 + bits / 6)
            break;

        *out++ = (unsigned char) lead;
        for ( ; bits >= 0; bits-= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        in = next;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (in - inb);
    return(0);

failure:
    *outlen = (int) (out - outstart);
    *inlenb = (int) (in - inb);
    return(-2);
}
977
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out.  The output is written byte by byte (high byte
 * first), so neither the host byte order nor the alignment of @outb
 * matters.  A NULL @in is an initialization call: the Byte Order Mark
 * FE FF is emitted if @outb has room for it.  Conversion stops quietly
 * when @outb is full, when @in ends in the middle of a sequence, or on a
 * value of 0x110000 and above.
 *
 * Returns 0 if success (2 for an initialization call that wrote the
 * Byte Order Mark), or -2 if the transcoding failed (malformed UTF-8)
 * The value of *inlen after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failure;   /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failure;   /* no chance for this in UTF-16 */

        /* incomplete sequence at the end of the block: leave it unconsumed */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a non-continuation byte here means the input is malformed */
            if ((d & 0xC0) != 0x80)
                goto failure;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) (c & 0xFF);
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair: two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
        }
        else
            break;
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(0);

failure:
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(-2);
}
1093
Daniel Veillard97ac1312001-05-30 19:14:17 +00001094/************************************************************************
1095 * *
1096 * Generic encoding handling routines *
1097 * *
1098 ************************************************************************/
1099
Owen Taylor3473f882001-02-23 17:55:21 +00001100/**
1101 * xmlDetectCharEncoding:
1102 * @in: a pointer to the first bytes of the XML entity, must be at least
1103 * 4 bytes long.
1104 * @len: pointer to the length of the buffer
1105 *
1106 * Guess the encoding of the entity using the first bytes of the entity content
1107 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1108 *
1109 * Returns one of the XML_CHAR_ENCODING_... values.
1110 */
1111xmlCharEncoding
1112xmlDetectCharEncoding(const unsigned char* in, int len)
1113{
1114 if (len >= 4) {
1115 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1116 (in[2] == 0x00) && (in[3] == 0x3C))
1117 return(XML_CHAR_ENCODING_UCS4BE);
1118 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1119 (in[2] == 0x00) && (in[3] == 0x00))
1120 return(XML_CHAR_ENCODING_UCS4LE);
1121 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1122 (in[2] == 0x3C) && (in[3] == 0x00))
1123 return(XML_CHAR_ENCODING_UCS4_2143);
1124 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1125 (in[2] == 0x00) && (in[3] == 0x00))
1126 return(XML_CHAR_ENCODING_UCS4_3412);
1127 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1128 (in[2] == 0xA7) && (in[3] == 0x94))
1129 return(XML_CHAR_ENCODING_EBCDIC);
1130 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1131 (in[2] == 0x78) && (in[3] == 0x6D))
1132 return(XML_CHAR_ENCODING_UTF8);
1133 }
1134 if (len >= 2) {
1135 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1136 return(XML_CHAR_ENCODING_UTF16BE);
1137 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1138 return(XML_CHAR_ENCODING_UTF16LE);
1139 }
1140 return(XML_CHAR_ENCODING_NONE);
1141}
1142
1143/**
1144 * xmlCleanupEncodingAliases:
1145 *
1146 * Unregisters all aliases
1147 */
1148void
1149xmlCleanupEncodingAliases(void) {
1150 int i;
1151
1152 if (xmlCharEncodingAliases == NULL)
1153 return;
1154
1155 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1156 if (xmlCharEncodingAliases[i].name != NULL)
1157 xmlFree((char *) xmlCharEncodingAliases[i].name);
1158 if (xmlCharEncodingAliases[i].alias != NULL)
1159 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1160 }
1161 xmlCharEncodingAliasesNb = 0;
1162 xmlCharEncodingAliasesMax = 0;
1163 xmlFree(xmlCharEncodingAliases);
1164}
1165
1166/**
1167 * xmlGetEncodingAlias:
1168 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1169 *
1170 * Lookup an encoding name for the given alias.
1171 *
1172 * Returns NULL if not found the original name otherwise
1173 */
1174const char *
1175xmlGetEncodingAlias(const char *alias) {
1176 int i;
1177 char upper[100];
1178
1179 if (alias == NULL)
1180 return(NULL);
1181
1182 if (xmlCharEncodingAliases == NULL)
1183 return(NULL);
1184
1185 for (i = 0;i < 99;i++) {
1186 upper[i] = toupper(alias[i]);
1187 if (upper[i] == 0) break;
1188 }
1189 upper[i] = 0;
1190
1191 /*
1192 * Walk down the list looking for a definition of the alias
1193 */
1194 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1195 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1196 return(xmlCharEncodingAliases[i].name);
1197 }
1198 }
1199 return(NULL);
1200}
1201
1202/**
1203 * xmlAddEncodingAlias:
1204 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1205 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1206 *
1207 * Registers and alias @alias for an encoding named @name. Existing alias
1208 * will be overwritten.
1209 *
1210 * Returns 0 in case of success, -1 in case of error
1211 */
1212int
1213xmlAddEncodingAlias(const char *name, const char *alias) {
1214 int i;
1215 char upper[100];
1216
1217 if ((name == NULL) || (alias == NULL))
1218 return(-1);
1219
1220 for (i = 0;i < 99;i++) {
1221 upper[i] = toupper(alias[i]);
1222 if (upper[i] == 0) break;
1223 }
1224 upper[i] = 0;
1225
1226 if (xmlCharEncodingAliases == NULL) {
1227 xmlCharEncodingAliasesNb = 0;
1228 xmlCharEncodingAliasesMax = 20;
1229 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1230 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1231 if (xmlCharEncodingAliases == NULL)
1232 return(-1);
1233 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1234 xmlCharEncodingAliasesMax *= 2;
1235 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1236 xmlRealloc(xmlCharEncodingAliases,
1237 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1238 }
1239 /*
1240 * Walk down the list looking for a definition of the alias
1241 */
1242 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1243 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1244 /*
1245 * Replace the definition.
1246 */
1247 xmlFree((char *) xmlCharEncodingAliases[i].name);
1248 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1249 return(0);
1250 }
1251 }
1252 /*
1253 * Add the definition
1254 */
1255 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1256 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1257 xmlCharEncodingAliasesNb++;
1258 return(0);
1259}
1260
1261/**
1262 * xmlDelEncodingAlias:
1263 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1264 *
1265 * Unregisters an encoding alias @alias
1266 *
1267 * Returns 0 in case of success, -1 in case of error
1268 */
1269int
1270xmlDelEncodingAlias(const char *alias) {
1271 int i;
1272
1273 if (alias == NULL)
1274 return(-1);
1275
1276 if (xmlCharEncodingAliases == NULL)
1277 return(-1);
1278 /*
1279 * Walk down the list looking for a definition of the alias
1280 */
1281 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1282 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1283 xmlFree((char *) xmlCharEncodingAliases[i].name);
1284 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1285 xmlCharEncodingAliasesNb--;
1286 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1287 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1288 return(0);
1289 }
1290 }
1291 return(-1);
1292}
1293
1294/**
1295 * xmlParseCharEncoding:
1296 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1297 *
1298 * Conpare the string to the known encoding schemes already known. Note
1299 * that the comparison is case insensitive accordingly to the section
1300 * [XML] 4.3.3 Character Encoding in Entities.
1301 *
1302 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1303 * if not recognized.
1304 */
1305xmlCharEncoding
1306xmlParseCharEncoding(const char* name)
1307{
1308 const char *alias;
1309 char upper[500];
1310 int i;
1311
1312 if (name == NULL)
1313 return(XML_CHAR_ENCODING_NONE);
1314
1315 /*
1316 * Do the alias resolution
1317 */
1318 alias = xmlGetEncodingAlias(name);
1319 if (alias != NULL)
1320 name = alias;
1321
1322 for (i = 0;i < 499;i++) {
1323 upper[i] = toupper(name[i]);
1324 if (upper[i] == 0) break;
1325 }
1326 upper[i] = 0;
1327
1328 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1329 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1330 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1331
1332 /*
1333 * NOTE: if we were able to parse this, the endianness of UTF16 is
1334 * already found and in use
1335 */
1336 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1337 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1338
1339 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1340 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1341 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1342
1343 /*
1344 * NOTE: if we were able to parse this, the endianness of UCS4 is
1345 * already found and in use
1346 */
1347 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1348 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1349 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1350
1351
1352 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1353 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1354 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1355
1356 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1357 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1358 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1359
1360 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1361 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1362 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1363 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1364 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1365 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1366 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1367
1368 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1369 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1370 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1371
1372#ifdef DEBUG_ENCODING
1373 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1374#endif
1375 return(XML_CHAR_ENCODING_ERROR);
1376}
1377
1378/**
1379 * xmlGetCharEncodingName:
1380 * @enc: the encoding
1381 *
1382 * The "canonical" name for XML encoding.
1383 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1384 * Section 4.3.3 Character Encoding in Entities
1385 *
1386 * Returns the canonical name for the given encoding
1387 */
1388
1389const char*
1390xmlGetCharEncodingName(xmlCharEncoding enc) {
1391 switch (enc) {
1392 case XML_CHAR_ENCODING_ERROR:
1393 return(NULL);
1394 case XML_CHAR_ENCODING_NONE:
1395 return(NULL);
1396 case XML_CHAR_ENCODING_UTF8:
1397 return("UTF-8");
1398 case XML_CHAR_ENCODING_UTF16LE:
1399 return("UTF-16");
1400 case XML_CHAR_ENCODING_UTF16BE:
1401 return("UTF-16");
1402 case XML_CHAR_ENCODING_EBCDIC:
1403 return("EBCDIC");
1404 case XML_CHAR_ENCODING_UCS4LE:
1405 return("ISO-10646-UCS-4");
1406 case XML_CHAR_ENCODING_UCS4BE:
1407 return("ISO-10646-UCS-4");
1408 case XML_CHAR_ENCODING_UCS4_2143:
1409 return("ISO-10646-UCS-4");
1410 case XML_CHAR_ENCODING_UCS4_3412:
1411 return("ISO-10646-UCS-4");
1412 case XML_CHAR_ENCODING_UCS2:
1413 return("ISO-10646-UCS-2");
1414 case XML_CHAR_ENCODING_8859_1:
1415 return("ISO-8859-1");
1416 case XML_CHAR_ENCODING_8859_2:
1417 return("ISO-8859-2");
1418 case XML_CHAR_ENCODING_8859_3:
1419 return("ISO-8859-3");
1420 case XML_CHAR_ENCODING_8859_4:
1421 return("ISO-8859-4");
1422 case XML_CHAR_ENCODING_8859_5:
1423 return("ISO-8859-5");
1424 case XML_CHAR_ENCODING_8859_6:
1425 return("ISO-8859-6");
1426 case XML_CHAR_ENCODING_8859_7:
1427 return("ISO-8859-7");
1428 case XML_CHAR_ENCODING_8859_8:
1429 return("ISO-8859-8");
1430 case XML_CHAR_ENCODING_8859_9:
1431 return("ISO-8859-9");
1432 case XML_CHAR_ENCODING_2022_JP:
1433 return("ISO-2022-JP");
1434 case XML_CHAR_ENCODING_SHIFT_JIS:
1435 return("Shift-JIS");
1436 case XML_CHAR_ENCODING_EUC_JP:
1437 return("EUC-JP");
1438 case XML_CHAR_ENCODING_ASCII:
1439 return(NULL);
1440 }
1441 return(NULL);
1442}
1443
Daniel Veillard97ac1312001-05-30 19:14:17 +00001444/************************************************************************
1445 * *
1446 * Char encoding handlers *
1447 * *
1448 ************************************************************************/
1449
Owen Taylor3473f882001-02-23 17:55:21 +00001450
1451/* the size should be growable, but it's not a big deal ... */
1452#define MAX_ENCODING_HANDLERS 50
1453static xmlCharEncodingHandlerPtr *handlers = NULL;
1454static int nbCharEncodingHandler = 0;
1455
1456/*
1457 * The default is UTF-8 for XML, that's also the default used for the
1458 * parser internals, so the default encoding handler is NULL
1459 */
1460
1461static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1462
1463/**
1464 * xmlNewCharEncodingHandler:
1465 * @name: the encoding name, in UTF-8 format (ASCII actually)
1466 * @input: the xmlCharEncodingInputFunc to read that encoding
1467 * @output: the xmlCharEncodingOutputFunc to write that encoding
1468 *
1469 * Create and registers an xmlCharEncodingHandler.
1470 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1471 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001472static xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001473xmlNewCharEncodingHandler(const char *name,
1474 xmlCharEncodingInputFunc input,
1475 xmlCharEncodingOutputFunc output) {
1476 xmlCharEncodingHandlerPtr handler;
1477 const char *alias;
1478 char upper[500];
1479 int i;
1480 char *up = 0;
1481
1482 /*
1483 * Do the alias resolution
1484 */
1485 alias = xmlGetEncodingAlias(name);
1486 if (alias != NULL)
1487 name = alias;
1488
1489 /*
1490 * Keep only the uppercase version of the encoding.
1491 */
1492 if (name == NULL) {
1493 xmlGenericError(xmlGenericErrorContext,
1494 "xmlNewCharEncodingHandler : no name !\n");
1495 return(NULL);
1496 }
1497 for (i = 0;i < 499;i++) {
1498 upper[i] = toupper(name[i]);
1499 if (upper[i] == 0) break;
1500 }
1501 upper[i] = 0;
1502 up = xmlMemStrdup(upper);
1503 if (up == NULL) {
1504 xmlGenericError(xmlGenericErrorContext,
1505 "xmlNewCharEncodingHandler : out of memory !\n");
1506 return(NULL);
1507 }
1508
1509 /*
1510 * allocate and fill-up an handler block.
1511 */
1512 handler = (xmlCharEncodingHandlerPtr)
1513 xmlMalloc(sizeof(xmlCharEncodingHandler));
1514 if (handler == NULL) {
1515 xmlGenericError(xmlGenericErrorContext,
1516 "xmlNewCharEncodingHandler : out of memory !\n");
1517 return(NULL);
1518 }
1519 handler->input = input;
1520 handler->output = output;
1521 handler->name = up;
1522
1523#ifdef LIBXML_ICONV_ENABLED
1524 handler->iconv_in = NULL;
1525 handler->iconv_out = NULL;
1526#endif /* LIBXML_ICONV_ENABLED */
1527
1528 /*
1529 * registers and returns the handler.
1530 */
1531 xmlRegisterCharEncodingHandler(handler);
1532#ifdef DEBUG_ENCODING
1533 xmlGenericError(xmlGenericErrorContext,
1534 "Registered encoding handler for %s\n", name);
1535#endif
1536 return(handler);
1537}
1538
1539/**
1540 * xmlInitCharEncodingHandlers:
1541 *
1542 * Initialize the char encoding support, it registers the default
1543 * encoding supported.
1544 * NOTE: while public, this function usually doesn't need to be called
1545 * in normal processing.
1546 */
1547void
1548xmlInitCharEncodingHandlers(void) {
1549 unsigned short int tst = 0x1234;
1550 unsigned char *ptr = (unsigned char *) &tst;
1551
1552 if (handlers != NULL) return;
1553
1554 handlers = (xmlCharEncodingHandlerPtr *)
1555 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1556
1557 if (*ptr == 0x12) xmlLittleEndian = 0;
1558 else if (*ptr == 0x34) xmlLittleEndian = 1;
1559 else xmlGenericError(xmlGenericErrorContext,
1560 "Odd problem at endianness detection\n");
1561
1562 if (handlers == NULL) {
1563 xmlGenericError(xmlGenericErrorContext,
1564 "xmlInitCharEncodingHandlers : out of memory !\n");
1565 return;
1566 }
1567 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1568 xmlUTF16LEHandler =
1569 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1570 xmlUTF16BEHandler =
1571 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1572 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1573 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard20042422001-05-31 18:22:04 +00001574 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor3473f882001-02-23 17:55:21 +00001575#ifdef LIBXML_HTML_ENABLED
1576 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1577#endif
1578}
1579
1580/**
1581 * xmlCleanupCharEncodingHandlers:
1582 *
1583 * Cleanup the memory allocated for the char encoding support, it
1584 * unregisters all the encoding handlers and the aliases.
1585 */
1586void
1587xmlCleanupCharEncodingHandlers(void) {
1588 xmlCleanupEncodingAliases();
1589
1590 if (handlers == NULL) return;
1591
1592 for (;nbCharEncodingHandler > 0;) {
1593 nbCharEncodingHandler--;
1594 if (handlers[nbCharEncodingHandler] != NULL) {
1595 if (handlers[nbCharEncodingHandler]->name != NULL)
1596 xmlFree(handlers[nbCharEncodingHandler]->name);
1597 xmlFree(handlers[nbCharEncodingHandler]);
1598 }
1599 }
1600 xmlFree(handlers);
1601 handlers = NULL;
1602 nbCharEncodingHandler = 0;
1603 xmlDefaultCharEncodingHandler = NULL;
1604}
1605
1606/**
1607 * xmlRegisterCharEncodingHandler:
1608 * @handler: the xmlCharEncodingHandlerPtr handler block
1609 *
1610 * Register the char encoding handler, surprizing, isn't it ?
1611 */
1612void
1613xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1614 if (handlers == NULL) xmlInitCharEncodingHandlers();
1615 if (handler == NULL) {
1616 xmlGenericError(xmlGenericErrorContext,
1617 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1618 return;
1619 }
1620
1621 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1622 xmlGenericError(xmlGenericErrorContext,
1623 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1624 xmlGenericError(xmlGenericErrorContext,
1625 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1626 return;
1627 }
1628 handlers[nbCharEncodingHandler++] = handler;
1629}
1630
1631/**
1632 * xmlGetCharEncodingHandler:
1633 * @enc: an xmlCharEncoding value.
1634 *
1635 * Search in the registrered set the handler able to read/write that encoding.
1636 *
1637 * Returns the handler or NULL if not found
1638 */
1639xmlCharEncodingHandlerPtr
1640xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1641 xmlCharEncodingHandlerPtr handler;
1642
1643 if (handlers == NULL) xmlInitCharEncodingHandlers();
1644 switch (enc) {
1645 case XML_CHAR_ENCODING_ERROR:
1646 return(NULL);
1647 case XML_CHAR_ENCODING_NONE:
1648 return(NULL);
1649 case XML_CHAR_ENCODING_UTF8:
1650 return(NULL);
1651 case XML_CHAR_ENCODING_UTF16LE:
1652 return(xmlUTF16LEHandler);
1653 case XML_CHAR_ENCODING_UTF16BE:
1654 return(xmlUTF16BEHandler);
1655 case XML_CHAR_ENCODING_EBCDIC:
1656 handler = xmlFindCharEncodingHandler("EBCDIC");
1657 if (handler != NULL) return(handler);
1658 handler = xmlFindCharEncodingHandler("ebcdic");
1659 if (handler != NULL) return(handler);
1660 break;
1661 case XML_CHAR_ENCODING_UCS4BE:
1662 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1663 if (handler != NULL) return(handler);
1664 handler = xmlFindCharEncodingHandler("UCS-4");
1665 if (handler != NULL) return(handler);
1666 handler = xmlFindCharEncodingHandler("UCS4");
1667 if (handler != NULL) return(handler);
1668 break;
1669 case XML_CHAR_ENCODING_UCS4LE:
1670 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1671 if (handler != NULL) return(handler);
1672 handler = xmlFindCharEncodingHandler("UCS-4");
1673 if (handler != NULL) return(handler);
1674 handler = xmlFindCharEncodingHandler("UCS4");
1675 if (handler != NULL) return(handler);
1676 break;
1677 case XML_CHAR_ENCODING_UCS4_2143:
1678 break;
1679 case XML_CHAR_ENCODING_UCS4_3412:
1680 break;
1681 case XML_CHAR_ENCODING_UCS2:
1682 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1683 if (handler != NULL) return(handler);
1684 handler = xmlFindCharEncodingHandler("UCS-2");
1685 if (handler != NULL) return(handler);
1686 handler = xmlFindCharEncodingHandler("UCS2");
1687 if (handler != NULL) return(handler);
1688 break;
1689
1690 /*
1691 * We used to keep ISO Latin encodings native in the
1692 * generated data. This led to so many problems that
1693 * this has been removed. One can still change this
1694 * back by registering no-ops encoders for those
1695 */
1696 case XML_CHAR_ENCODING_8859_1:
1697 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1698 if (handler != NULL) return(handler);
1699 break;
1700 case XML_CHAR_ENCODING_8859_2:
1701 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1702 if (handler != NULL) return(handler);
1703 break;
1704 case XML_CHAR_ENCODING_8859_3:
1705 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1706 if (handler != NULL) return(handler);
1707 break;
1708 case XML_CHAR_ENCODING_8859_4:
1709 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1710 if (handler != NULL) return(handler);
1711 break;
1712 case XML_CHAR_ENCODING_8859_5:
1713 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1714 if (handler != NULL) return(handler);
1715 break;
1716 case XML_CHAR_ENCODING_8859_6:
1717 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1718 if (handler != NULL) return(handler);
1719 break;
1720 case XML_CHAR_ENCODING_8859_7:
1721 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1722 if (handler != NULL) return(handler);
1723 break;
1724 case XML_CHAR_ENCODING_8859_8:
1725 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1726 if (handler != NULL) return(handler);
1727 break;
1728 case XML_CHAR_ENCODING_8859_9:
1729 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1730 if (handler != NULL) return(handler);
1731 break;
1732
1733
1734 case XML_CHAR_ENCODING_2022_JP:
1735 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1736 if (handler != NULL) return(handler);
1737 break;
1738 case XML_CHAR_ENCODING_SHIFT_JIS:
1739 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1740 if (handler != NULL) return(handler);
1741 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1742 if (handler != NULL) return(handler);
1743 handler = xmlFindCharEncodingHandler("Shift_JIS");
1744 if (handler != NULL) return(handler);
1745 break;
1746 case XML_CHAR_ENCODING_EUC_JP:
1747 handler = xmlFindCharEncodingHandler("EUC-JP");
1748 if (handler != NULL) return(handler);
1749 break;
1750 default:
1751 break;
1752 }
1753
1754#ifdef DEBUG_ENCODING
1755 xmlGenericError(xmlGenericErrorContext,
1756 "No handler found for encoding %d\n", enc);
1757#endif
1758 return(NULL);
1759}
1760
1761/**
1762 * xmlGetCharEncodingHandler:
1763 * @enc: a string describing the char encoding.
1764 *
1765 * Search in the registrered set the handler able to read/write that encoding.
1766 *
1767 * Returns the handler or NULL if not found
1768 */
1769xmlCharEncodingHandlerPtr
1770xmlFindCharEncodingHandler(const char *name) {
1771 const char *nalias;
1772 const char *norig;
1773 xmlCharEncoding alias;
1774#ifdef LIBXML_ICONV_ENABLED
1775 xmlCharEncodingHandlerPtr enc;
1776 iconv_t icv_in, icv_out;
1777#endif /* LIBXML_ICONV_ENABLED */
1778 char upper[100];
1779 int i;
1780
1781 if (handlers == NULL) xmlInitCharEncodingHandlers();
1782 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1783 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1784
1785 /*
1786 * Do the alias resolution
1787 */
1788 norig = name;
1789 nalias = xmlGetEncodingAlias(name);
1790 if (nalias != NULL)
1791 name = nalias;
1792
1793 /*
1794 * Check first for directly registered encoding names
1795 */
1796 for (i = 0;i < 99;i++) {
1797 upper[i] = toupper(name[i]);
1798 if (upper[i] == 0) break;
1799 }
1800 upper[i] = 0;
1801
1802 for (i = 0;i < nbCharEncodingHandler; i++)
1803 if (!strcmp(upper, handlers[i]->name)) {
1804#ifdef DEBUG_ENCODING
1805 xmlGenericError(xmlGenericErrorContext,
1806 "Found registered handler for encoding %s\n", name);
1807#endif
1808 return(handlers[i]);
1809 }
1810
1811#ifdef LIBXML_ICONV_ENABLED
1812 /* check whether iconv can handle this */
1813 icv_in = iconv_open("UTF-8", name);
1814 icv_out = iconv_open(name, "UTF-8");
1815 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1816 enc = (xmlCharEncodingHandlerPtr)
1817 xmlMalloc(sizeof(xmlCharEncodingHandler));
1818 if (enc == NULL) {
1819 iconv_close(icv_in);
1820 iconv_close(icv_out);
1821 return(NULL);
1822 }
1823 enc->name = xmlMemStrdup(name);
1824 enc->input = NULL;
1825 enc->output = NULL;
1826 enc->iconv_in = icv_in;
1827 enc->iconv_out = icv_out;
1828#ifdef DEBUG_ENCODING
1829 xmlGenericError(xmlGenericErrorContext,
1830 "Found iconv handler for encoding %s\n", name);
1831#endif
1832 return enc;
1833 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1834 xmlGenericError(xmlGenericErrorContext,
1835 "iconv : problems with filters for '%s'\n", name);
1836 }
1837#endif /* LIBXML_ICONV_ENABLED */
1838
1839#ifdef DEBUG_ENCODING
1840 xmlGenericError(xmlGenericErrorContext,
1841 "No handler found for encoding %s\n", name);
1842#endif
1843
1844 /*
1845 * Fallback using the canonical names
1846 */
1847 alias = xmlParseCharEncoding(norig);
1848 if (alias != XML_CHAR_ENCODING_ERROR) {
1849 const char* canon;
1850 canon = xmlGetCharEncodingName(alias);
1851 if ((canon != NULL) && (strcmp(name, canon))) {
1852 return(xmlFindCharEncodingHandler(canon));
1853 }
1854 }
1855
1856 return(NULL);
1857}
1858
Daniel Veillard97ac1312001-05-30 19:14:17 +00001859/************************************************************************
1860 * *
1861 * ICONV based generic conversion functions *
1862 * *
1863 ************************************************************************/
1864
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert, in the source
 *       encoding of @cd
 * @inlen:  the length of @in
 *
 * Run one iconv() conversion step and translate its outcome to the
 * return codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (E2BIG), or
 *     -2 if the transcoding fails (EILSEQ: the input is not valid or
 *        the result can't be represented in the target encoding), or
 *     -3 for any other failure, e.g. when the input ends in the middle
 *        of a character (EINVAL).
 *
 * When @in is not NULL, the value of @inlen after return is the number
 * of octets consumed and the value of @outlen is the number of octets
 * produced. When @in is NULL both are set to 0.
 * NOTE(review): with @in == NULL any octet iconv() may have emitted is
 * therefore not reported to the caller - confirm this is intended.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t srcLeft = *inlen;
    size_t dstLeft = *outlen;
    const char *src = (const char *) in;
    char *dst = (char *) out;
    int rc;

    rc = iconv(cd, &src, &srcLeft, &dst, &dstLeft);

    /* turn the "bytes left" counters into "bytes used" ones */
    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
    } else {
        *inlen -= srcLeft;
        *outlen -= dstLeft;
    }

    /* everything consumed and no error reported */
    if ((srcLeft == 0) && (rc != -1))
        return 0;

#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL and anything else */
    return -3;
}
#endif /* LIBXML_ICONV_ENABLED */
1925
Daniel Veillard97ac1312001-05-30 19:14:17 +00001926/************************************************************************
1927 * *
1928 * The real API used by libxml for on-the-fly conversion *
1929 * *
1930 ************************************************************************/
1931
Owen Taylor3473f882001-02-23 17:55:21 +00001932/**
1933 * xmlCharEncFirstLine:
1934 * @handler: char enconding transformation data structure
1935 * @out: an xmlBuffer for the output.
1936 * @in: an xmlBuffer for the input
1937 *
1938 * Front-end for the encoding handler input function, but handle only
1939 * the very first line, i.e. limit itself to 45 chars.
1940 *
1941 * Returns the number of byte written if success, or
1942 * -1 general error
1943 * -2 if the transcoding fails (for *in is not valid utf8 string or
1944 * the result of transformation can't fit into the encoding we want), or
1945 */
1946int
1947xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1948 xmlBufferPtr in) {
1949 int ret = -2;
1950 int written;
1951 int toconv;
1952
1953 if (handler == NULL) return(-1);
1954 if (out == NULL) return(-1);
1955 if (in == NULL) return(-1);
1956
1957 written = out->size - out->use;
1958 toconv = in->use;
1959 if (toconv * 2 >= written) {
1960 xmlBufferGrow(out, toconv);
1961 written = out->size - out->use - 1;
1962 }
1963
1964 /*
1965 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1966 * 45 chars should be sufficient to reach the end of the encoding
1967 * decalration without going too far inside the document content.
1968 */
1969 written = 45;
1970
1971 if (handler->input != NULL) {
1972 ret = handler->input(&out->content[out->use], &written,
1973 in->content, &toconv);
1974 xmlBufferShrink(in, toconv);
1975 out->use += written;
1976 out->content[out->use] = 0;
1977 }
1978#ifdef LIBXML_ICONV_ENABLED
1979 else if (handler->iconv_in != NULL) {
1980 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1981 &written, in->content, &toconv);
1982 xmlBufferShrink(in, toconv);
1983 out->use += written;
1984 out->content[out->use] = 0;
1985 if (ret == -1) ret = -3;
1986 }
1987#endif /* LIBXML_ICONV_ENABLED */
1988#ifdef DEBUG_ENCODING
1989 switch (ret) {
1990 case 0:
1991 xmlGenericError(xmlGenericErrorContext,
1992 "converted %d bytes to %d bytes of input\n",
1993 toconv, written);
1994 break;
1995 case -1:
1996 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1997 toconv, written, in->use);
1998 break;
1999 case -2:
2000 xmlGenericError(xmlGenericErrorContext,
2001 "input conversion failed due to input error\n");
2002 break;
2003 case -3:
2004 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2005 toconv, written, in->use);
2006 break;
2007 default:
2008 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2009 }
2010#endif
2011 /*
2012 * Ignore when input buffer is not on a boundary
2013 */
2014 if (ret == -3) ret = 0;
2015 if (ret == -1) ret = 0;
2016 return(ret);
2017}
2018
2019/**
2020 * xmlCharEncInFunc:
2021 * @handler: char enconding transformation data structure
2022 * @out: an xmlBuffer for the output.
2023 * @in: an xmlBuffer for the input
2024 *
2025 * Generic front-end for the encoding handler input function
2026 *
2027 * Returns the number of byte written if success, or
2028 * -1 general error
2029 * -2 if the transcoding fails (for *in is not valid utf8 string or
2030 * the result of transformation can't fit into the encoding we want), or
2031 */
2032int
2033xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2034 xmlBufferPtr in) {
2035 int ret = -2;
2036 int written;
2037 int toconv;
2038
2039 if (handler == NULL) return(-1);
2040 if (out == NULL) return(-1);
2041 if (in == NULL) return(-1);
2042
2043 toconv = in->use;
2044 if (toconv == 0)
2045 return(0);
2046 written = out->size - out->use;
2047 if (toconv * 2 >= written) {
2048 xmlBufferGrow(out, out->size + toconv * 2);
2049 written = out->size - out->use - 1;
2050 }
2051 if (handler->input != NULL) {
2052 ret = handler->input(&out->content[out->use], &written,
2053 in->content, &toconv);
2054 xmlBufferShrink(in, toconv);
2055 out->use += written;
2056 out->content[out->use] = 0;
2057 }
2058#ifdef LIBXML_ICONV_ENABLED
2059 else if (handler->iconv_in != NULL) {
2060 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2061 &written, in->content, &toconv);
2062 xmlBufferShrink(in, toconv);
2063 out->use += written;
2064 out->content[out->use] = 0;
2065 if (ret == -1) ret = -3;
2066 }
2067#endif /* LIBXML_ICONV_ENABLED */
2068 switch (ret) {
2069#ifdef DEBUG_ENCODING
2070 case 0:
2071 xmlGenericError(xmlGenericErrorContext,
2072 "converted %d bytes to %d bytes of input\n",
2073 toconv, written);
2074 break;
2075 case -1:
2076 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2077 toconv, written, in->use);
2078 break;
2079 case -3:
2080 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2081 toconv, written, in->use);
2082 break;
2083#endif
2084 case -2:
2085 xmlGenericError(xmlGenericErrorContext,
2086 "input conversion failed due to input error\n");
2087 xmlGenericError(xmlGenericErrorContext,
2088 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2089 in->content[0], in->content[1],
2090 in->content[2], in->content[3]);
2091 }
2092 /*
2093 * Ignore when input buffer is not on a boundary
2094 */
2095 if (ret == -3) ret = 0;
2096 return(ret);
2097}
2098
2099/**
2100 * xmlCharEncOutFunc:
2101 * @handler: char enconding transformation data structure
2102 * @out: an xmlBuffer for the output.
2103 * @in: an xmlBuffer for the input
2104 *
2105 * Generic front-end for the encoding handler output function
2106 * a first call with @in == NULL has to be made firs to initiate the
2107 * output in case of non-stateless encoding needing to initiate their
2108 * state or the output (like the BOM in UTF16).
2109 * In case of UTF8 sequence conversion errors for the given encoder,
2110 * the content will be automatically remapped to a CharRef sequence.
2111 *
2112 * Returns the number of byte written if success, or
2113 * -1 general error
2114 * -2 if the transcoding fails (for *in is not valid utf8 string or
2115 * the result of transformation can't fit into the encoding we want), or
2116 */
2117int
2118xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2119 xmlBufferPtr in) {
2120 int ret = -2;
2121 int written;
2122 int writtentot = 0;
2123 int toconv;
2124 int output = 0;
2125
2126 if (handler == NULL) return(-1);
2127 if (out == NULL) return(-1);
2128
2129retry:
2130
2131 written = out->size - out->use;
2132
2133 /*
2134 * First specific handling of in = NULL, i.e. the initialization call
2135 */
2136 if (in == NULL) {
2137 toconv = 0;
2138 if (handler->output != NULL) {
2139 ret = handler->output(&out->content[out->use], &written,
2140 NULL, &toconv);
2141 out->use += written;
2142 out->content[out->use] = 0;
2143 }
2144#ifdef LIBXML_ICONV_ENABLED
2145 else if (handler->iconv_out != NULL) {
2146 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2147 &written, NULL, &toconv);
2148 out->use += written;
2149 out->content[out->use] = 0;
2150 }
2151#endif /* LIBXML_ICONV_ENABLED */
2152#ifdef DEBUG_ENCODING
2153 xmlGenericError(xmlGenericErrorContext,
2154 "initialized encoder\n");
2155#endif
2156 return(0);
2157 }
2158
2159 /*
2160 * Convertion itself.
2161 */
2162 toconv = in->use;
2163 if (toconv == 0)
2164 return(0);
2165 if (toconv * 2 >= written) {
2166 xmlBufferGrow(out, toconv * 2);
2167 written = out->size - out->use - 1;
2168 }
2169 if (handler->output != NULL) {
2170 ret = handler->output(&out->content[out->use], &written,
2171 in->content, &toconv);
2172 xmlBufferShrink(in, toconv);
2173 out->use += written;
2174 writtentot += written;
2175 out->content[out->use] = 0;
2176 }
2177#ifdef LIBXML_ICONV_ENABLED
2178 else if (handler->iconv_out != NULL) {
2179 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2180 &written, in->content, &toconv);
2181 xmlBufferShrink(in, toconv);
2182 out->use += written;
2183 writtentot += written;
2184 out->content[out->use] = 0;
2185 if (ret == -1) {
2186 if (written > 0) {
2187 /*
2188 * Can be a limitation of iconv
2189 */
2190 goto retry;
2191 }
2192 ret = -3;
2193 }
2194 }
2195#endif /* LIBXML_ICONV_ENABLED */
2196 else {
2197 xmlGenericError(xmlGenericErrorContext,
2198 "xmlCharEncOutFunc: no output function !\n");
2199 return(-1);
2200 }
2201
2202 if (ret >= 0) output += ret;
2203
2204 /*
2205 * Attempt to handle error cases
2206 */
2207 switch (ret) {
2208#ifdef DEBUG_ENCODING
2209 case 0:
2210 xmlGenericError(xmlGenericErrorContext,
2211 "converted %d bytes to %d bytes of output\n",
2212 toconv, written);
2213 break;
2214 case -1:
2215 xmlGenericError(xmlGenericErrorContext,
2216 "output conversion failed by lack of space\n");
2217 break;
2218#endif
2219 case -3:
2220 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2221 toconv, written, in->use);
2222 break;
2223 case -2: {
2224 int len = in->use;
2225 const xmlChar *utf = (const xmlChar *) in->content;
2226 int cur;
2227
2228 cur = xmlGetUTF8Char(utf, &len);
2229 if (cur > 0) {
2230 xmlChar charref[20];
2231
2232#ifdef DEBUG_ENCODING
2233 xmlGenericError(xmlGenericErrorContext,
2234 "handling output conversion error\n");
2235 xmlGenericError(xmlGenericErrorContext,
2236 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2237 in->content[0], in->content[1],
2238 in->content[2], in->content[3]);
2239#endif
2240 /*
2241 * Removes the UTF8 sequence, and replace it by a charref
2242 * and continue the transcoding phase, hoping the error
2243 * did not mangle the encoder state.
2244 */
2245 sprintf((char *) charref, "&#x%X;", cur);
2246 xmlBufferShrink(in, len);
2247 xmlBufferAddHead(in, charref, -1);
2248
2249 goto retry;
2250 } else {
2251 xmlGenericError(xmlGenericErrorContext,
2252 "output conversion failed due to conv error\n");
2253 xmlGenericError(xmlGenericErrorContext,
2254 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2255 in->content[0], in->content[1],
2256 in->content[2], in->content[3]);
2257 in->content[0] = ' ';
2258 }
2259 break;
2260 }
2261 }
2262 return(ret);
2263}
2264
2265/**
2266 * xmlCharEncCloseFunc:
2267 * @handler: char enconding transformation data structure
2268 *
2269 * Generic front-end for hencoding handler close function
2270 *
2271 * Returns 0 if success, or -1 in case of error
2272 */
2273int
2274xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2275 int ret = 0;
2276 if (handler == NULL) return(-1);
2277 if (handler->name == NULL) return(-1);
2278#ifdef LIBXML_ICONV_ENABLED
2279 /*
2280 * Iconv handlers can be oused only once, free the whole block.
2281 * and the associated icon resources.
2282 */
2283 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2284 if (handler->name != NULL)
2285 xmlFree(handler->name);
2286 handler->name = NULL;
2287 if (handler->iconv_out != NULL) {
2288 if (iconv_close(handler->iconv_out))
2289 ret = -1;
2290 handler->iconv_out = NULL;
2291 }
2292 if (handler->iconv_in != NULL) {
2293 if (iconv_close(handler->iconv_in))
2294 ret = -1;
2295 handler->iconv_in = NULL;
2296 }
2297 xmlFree(handler);
2298 }
2299#endif /* LIBXML_ICONV_ENABLED */
2300#ifdef DEBUG_ENCODING
2301 if (ret)
2302 xmlGenericError(xmlGenericErrorContext,
2303 "failed to close the encoding handler\n");
2304 else
2305 xmlGenericError(xmlGenericErrorContext,
2306 "closed the encoding handler\n");
2307
2308#endif
2309 return(ret);
2310}
2311