blob: a9495c429072eec91cea9eee7d1dff080188523a [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
15 *
Owen Taylor3473f882001-02-23 17:55:21 +000016 * See Copyright for the status of this software.
17 *
Daniel Veillardc5d64342001-06-24 12:13:24 +000018 * daniel@veillard.com
Daniel Veillard97ac1312001-05-30 19:14:17 +000019 *
20 * UTF8 string routines from:
21 * "William M. Brack" <wbrack@mmm.com.hk>
22 *
23 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Owen Taylor3473f882001-02-23 17:55:21 +000024 */
25
Bjorn Reese70a9da52001-04-21 16:57:29 +000026#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000027
Owen Taylor3473f882001-02-23 17:55:21 +000028#include <string.h>
29
30#ifdef HAVE_CTYPE_H
31#include <ctype.h>
32#endif
33#ifdef HAVE_STDLIB_H
34#include <stdlib.h>
35#endif
Owen Taylor3473f882001-02-23 17:55:21 +000036#ifdef LIBXML_ICONV_ENABLED
37#ifdef HAVE_ERRNO_H
38#include <errno.h>
39#endif
40#endif
41#include <libxml/encoding.h>
42#include <libxml/xmlmemory.h>
43#ifdef LIBXML_HTML_ENABLED
44#include <libxml/HTMLparser.h>
45#endif
46#include <libxml/xmlerror.h>
Daniel Veillard64a411c2001-10-15 12:32:07 +000047#include <libxml/globals.h>
Owen Taylor3473f882001-02-23 17:55:21 +000048
/*
 * Cached UTF-16 handlers (little- and big-endian).
 * NOTE(review): they are not referenced in this part of the file;
 * presumably they are filled in when the handlers are registered.
 */
static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;

/*
 * One entry of the encoding alias table: @alias is the alternate
 * spelling, @name the encoding name it stands for.
 */
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
struct _xmlCharEncodingAlias {
    const char *name;	/* encoding name returned on a lookup hit */
    const char *alias;	/* key the lookup compares against */
};

/*
 * The alias table itself: the array, the number of entries in use and
 * (presumably) the allocated capacity -- the code growing the table is
 * not visible here.
 */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;
static int xmlCharEncodingAliasesMax = 0;

/* Compile-time switch for encoding traces, disabled by the "#if 0". */
#ifdef LIBXML_ICONV_ENABLED
#if 0
#define DEBUG_ENCODING  /* Define this to get encoding traces */
#endif
#endif

/*
 * Non-zero when the host is taken to be little-endian; starts at 1.
 * NOTE(review): presumably corrected during initialisation, which is
 * not visible in this part of the file.
 */
static int xmlLittleEndian = 1;
70
/************************************************************************
 *									*
 *		Generic UTF8 handling routines				*
 *									*
 * From rfc2044: encoding of the Unicode values on UTF-8:		*
 *									*
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)		*
 * 0000 0000-0000 007F   0xxxxxxx					*
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx				*
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx			*
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx		*
 *									*
 * I hope we won't use values > 0xFFFF anytime soon !			*
 * (The routines below nevertheless accept the 4-byte form.)		*
 *									*
 ************************************************************************/
Owen Taylor3473f882001-02-23 17:55:21 +000085
86/**
Daniel Veillarde043ee12001-04-16 14:08:07 +000087 * xmlUTF8Strlen:
88 * @utf: a sequence of UTF-8 encoded bytes
89 *
Daniel Veillard60087f32001-10-10 09:45:09 +000090 * compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillarde043ee12001-04-16 14:08:07 +000091 * checking of the content of the string.
92 *
93 * Returns the number of characters in the string or -1 in case of error
94 */
95int
Daniel Veillard97ac1312001-05-30 19:14:17 +000096xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillarde043ee12001-04-16 14:08:07 +000097 int ret = 0;
98
99 if (utf == NULL)
100 return(-1);
101
102 while (*utf != 0) {
103 if (utf[0] & 0x80) {
104 if ((utf[1] & 0xc0) != 0x80)
105 return(-1);
106 if ((utf[0] & 0xe0) == 0xe0) {
107 if ((utf[2] & 0xc0) != 0x80)
108 return(-1);
109 if ((utf[0] & 0xf0) == 0xf0) {
110 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
111 return(-1);
112 utf += 4;
113 } else {
114 utf += 3;
115 }
116 } else {
117 utf += 2;
118 }
119 } else {
120 utf++;
121 }
122 ret++;
123 }
124 return(ret);
125}
126
127/**
Owen Taylor3473f882001-02-23 17:55:21 +0000128 * xmlGetUTF8Char:
129 * @utf: a sequence of UTF-8 encoded bytes
130 * @len: a pointer to @bytes len
131 *
132 * Read one UTF8 Char from @utf
133 *
134 * Returns the char value or -1 in case of error and update @len with the
135 * number of bytes used
136 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000137static int
Owen Taylor3473f882001-02-23 17:55:21 +0000138xmlGetUTF8Char(const unsigned char *utf, int *len) {
139 unsigned int c;
140
141 if (utf == NULL)
142 goto error;
143 if (len == NULL)
144 goto error;
145 if (*len < 1)
146 goto error;
147
148 c = utf[0];
149 if (c & 0x80) {
150 if (*len < 2)
151 goto error;
152 if ((utf[1] & 0xc0) != 0x80)
153 goto error;
154 if ((c & 0xe0) == 0xe0) {
155 if (*len < 3)
156 goto error;
157 if ((utf[2] & 0xc0) != 0x80)
158 goto error;
159 if ((c & 0xf0) == 0xf0) {
160 if (*len < 4)
161 goto error;
162 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
163 goto error;
164 *len = 4;
165 /* 4-byte code */
166 c = (utf[0] & 0x7) << 18;
167 c |= (utf[1] & 0x3f) << 12;
168 c |= (utf[2] & 0x3f) << 6;
169 c |= utf[3] & 0x3f;
170 } else {
171 /* 3-byte code */
172 *len = 3;
173 c = (utf[0] & 0xf) << 12;
174 c |= (utf[1] & 0x3f) << 6;
175 c |= utf[2] & 0x3f;
176 }
177 } else {
178 /* 2-byte code */
179 *len = 2;
180 c = (utf[0] & 0x1f) << 6;
181 c |= utf[1] & 0x3f;
182 }
183 } else {
184 /* 1-byte code */
185 *len = 1;
186 }
187 return(c);
188
189error:
190 *len = 0;
191 return(-1);
192}
193
194/**
195 * xmlCheckUTF8: Check utf-8 string for legality.
196 * @utf: Pointer to putative utf-8 encoded string.
197 *
198 * Checks @utf for being valid utf-8. @utf is assumed to be
199 * null-terminated. This function is not super-strict, as it will
200 * allow longer utf-8 sequences than necessary. Note that Java is
201 * capable of producing these sequences if provoked. Also note, this
202 * routine checks for the 4-byte maxiumum size, but does not check for
203 * 0x10ffff maximum value.
204 *
205 * Return value: true if @utf is valid.
206 **/
207int
208xmlCheckUTF8(const unsigned char *utf)
209{
210 int ix;
211 unsigned char c;
212
213 for (ix = 0; (c = utf[ix]);) {
214 if (c & 0x80) {
215 if ((utf[ix + 1] & 0xc0) != 0x80)
216 return(0);
217 if ((c & 0xe0) == 0xe0) {
218 if ((utf[ix + 2] & 0xc0) != 0x80)
219 return(0);
220 if ((c & 0xf0) == 0xf0) {
221 if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
222 return(0);
223 ix += 4;
224 /* 4-byte code */
225 } else
226 /* 3-byte code */
227 ix += 3;
228 } else
229 /* 2-byte code */
230 ix += 2;
231 } else
232 /* 1-byte code */
233 ix++;
234 }
235 return(1);
236}
237
238/**
Daniel Veillard97ac1312001-05-30 19:14:17 +0000239 * xmlUTF8Strsize:
240 * @utf: a sequence of UTF-8 encoded bytes
241 * @len: the number of characters in the array
242 *
243 * storage size of an UTF8 string
244 *
245 * Returns the storage size of
246 * the first 'len' characters of ARRAY
247 *
248 */
249
250int
251xmlUTF8Strsize(const xmlChar *utf, int len) {
252 const xmlChar *ptr=utf;
253 xmlChar ch;
254
255 if (len <= 0)
256 return(0);
257
258 while ( len-- > 0) {
259 if ( !*ptr )
260 break;
261 if ( (ch = *ptr++) & 0x80)
262 while ( (ch<<=1) & 0x80 )
263 ptr++;
264 }
265 return (ptr - utf);
266}
267
268
269/**
270 * xmlUTF8Strndup:
271 * @utf: the input UTF8 *
272 * @len: the len of @utf (in chars)
273 *
274 * a strndup for array of UTF8's
275 *
276 * Returns a new UTF8 * or NULL
277 */
278xmlChar *
279xmlUTF8Strndup(const xmlChar *utf, int len) {
280 xmlChar *ret;
281 int i;
282
283 if ((utf == NULL) || (len < 0)) return(NULL);
284 i = xmlUTF8Strsize(utf, len);
285 ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
286 if (ret == NULL) {
287 xmlGenericError(xmlGenericErrorContext,
288 "malloc of %ld byte failed\n",
289 (len + 1) * (long)sizeof(xmlChar));
290 return(NULL);
291 }
292 memcpy(ret, utf, i * sizeof(xmlChar));
293 ret[i] = 0;
294 return(ret);
295}
296
297/**
298 * xmlUTF8Strpos:
299 * @utf: the input UTF8 *
300 * @pos: the position of the desired UTF8 char (in chars)
301 *
302 * a function to provide the equivalent of fetching a
303 * character from a string array
304 *
305 * Returns a pointer to the UTF8 character or NULL
306 */
307xmlChar *
308xmlUTF8Strpos(const xmlChar *utf, int pos) {
309 xmlChar ch;
310
311 if (utf == NULL) return(NULL);
312 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
313 return(NULL);
314 while (pos--) {
315 if ((ch=*utf++) == 0) return(NULL);
316 if ( ch & 0x80 ) {
317 /* if not simple ascii, verify proper format */
318 if ( (ch & 0xc0) != 0xc0 )
319 return(NULL);
320 /* then skip over remaining bytes for this char */
321 while ( (ch <<= 1) & 0x80 )
322 if ( (*utf++ & 0xc0) != 0x80 )
323 return(NULL);
324 }
325 }
326 return((xmlChar *)utf);
327}
328
329/**
330 * xmlUTF8Strloc:
331 * @utf: the input UTF8 *
332 * @utfchar: the UTF8 character to be found
333 *
334 * a function to provide relative location of a UTF8 char
335 *
336 * Returns the relative character position of the desired char
337 * or -1 if not found
338 */
339int
340xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
341 int i, size;
342 xmlChar ch;
343
344 if (utf==NULL || utfchar==NULL) return -1;
345 size = xmlUTF8Strsize(utfchar, 1);
346 for(i=0; (ch=*utf) != 0; i++) {
347 if (xmlStrncmp(utf, utfchar, size)==0)
348 return(i);
349 utf++;
350 if ( ch & 0x80 ) {
351 /* if not simple ascii, verify proper format */
352 if ( (ch & 0xc0) != 0xc0 )
353 return(-1);
354 /* then skip over remaining bytes for this char */
355 while ( (ch <<= 1) & 0x80 )
356 if ( (*utf++ & 0xc0) != 0x80 )
357 return(-1);
358 }
359 }
360
361 return(-1);
362}
363/**
364 * xmlUTF8Strsub:
365 * @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard97ac1312001-05-30 19:14:17 +0000366 * @start: relative pos of first char
367 * @len: total number to copy
368 *
369 * Note: positions are given in units of UTF-8 chars
370 *
371 * Returns a pointer to a newly created string
372 * or NULL if any problem
373 */
374
375xmlChar *
376xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
377 int i;
378 xmlChar ch;
379
380 if (utf == NULL) return(NULL);
381 if (start < 0) return(NULL);
382 if (len < 0) return(NULL);
383
384 /*
385 * Skip over any leading chars
386 */
387 for (i = 0;i < start;i++) {
388 if ((ch=*utf++) == 0) return(NULL);
389 if ( ch & 0x80 ) {
390 /* if not simple ascii, verify proper format */
391 if ( (ch & 0xc0) != 0xc0 )
392 return(NULL);
393 /* then skip over remaining bytes for this char */
394 while ( (ch <<= 1) & 0x80 )
395 if ( (*utf++ & 0xc0) != 0x80 )
396 return(NULL);
397 }
398 }
399
400 return(xmlUTF8Strndup(utf, len));
401}
402
403/************************************************************************
404 * *
405 * Conversions To/From UTF8 encoding *
406 * *
407 ************************************************************************/
408
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. ASCII bytes are copied unchanged; the copy stops
 * while more than 5 bytes of @out are still free.
 *
 * Returns 0 if success, or -1 if a byte above 0x7F is met
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const int room = *outlen;
    const int avail = *inlen;
    int done = 0;	/* bytes copied: input and output advance together */
    int status = 0;

    while ((done < avail) && (done + 5 < room)) {
        unsigned char ch = in[done];

        if (ch >= 0x80) {
            /* not an ASCII byte: report what was converted so far */
            status = -1;
            break;
        }
        out[done] = ch;
        done++;
    }
    *outlen = done;
    *inlen = done;
    return(status);
}
459
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. Calling with @in == NULL is an initialization
 * request and produces nothing.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 *         a character which has no ASCII equivalent)
 * The value of @inlen after return is the number of octets consumed;
 * an incomplete trailing sequence is left unconsumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failed; /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failed; /* no chance for this in Ascii */

        /* sequence cut by the end of the block: wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a malformed sequence must not yield a made-up character */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80)
            goto failed; /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

failed:
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
542
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. A character is only converted when its whole
 * UTF-8 form fits in @out, so @out never ends with a partial sequence.
 *
 * Returns 0
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend;
    unsigned int c;

    inend = in + (*inlen);
    while (in < inend) {
        c = *in;

        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* 0x80..0xFF always takes two bytes: need room for both */
            if (outend - out < 2)
                break;
            *out++ = ((c >> 6) & 0x1F) | 0xC0;
            *out++ = (c & 0x3F) | 0x80;
        }
        in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
590
591/**
592 * UTF8Toisolat1:
593 * @out: a pointer to an array of bytes to store the result
594 * @outlen: the length of @out
595 * @in: a pointer to an array of UTF-8 chars
596 * @inlen: the length of @in
597 *
598 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
599 * block of chars out.
600 *
601 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
602 * The value of @inlen after return is the number of octets consumed
603 * as the return value is positive, else unpredictiable.
604 * The value of @outlen after return is the number of ocetes consumed.
605 */
606int
607UTF8Toisolat1(unsigned char* out, int *outlen,
608 const unsigned char* in, int *inlen) {
609 const unsigned char* processed = in;
610 const unsigned char* outend;
611 const unsigned char* outstart = out;
612 const unsigned char* instart = in;
613 const unsigned char* inend;
614 unsigned int c, d;
615 int trailing;
616
617 if (in == NULL) {
618 /*
619 * initialization nothing to do
620 */
621 *outlen = 0;
622 *inlen = 0;
623 return(0);
624 }
625 inend = in + (*inlen);
626 outend = out + (*outlen);
627 while (in < inend) {
628 d = *in++;
629 if (d < 0x80) { c= d; trailing= 0; }
630 else if (d < 0xC0) {
631 /* trailing byte in leading position */
632 *outlen = out - outstart;
633 *inlen = processed - instart;
634 return(-2);
635 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
636 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
637 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
638 else {
639 /* no chance for this in IsoLat1 */
640 *outlen = out - outstart;
641 *inlen = processed - instart;
642 return(-2);
643 }
644
645 if (inend - in < trailing) {
646 break;
647 }
648
649 for ( ; trailing; trailing--) {
650 if (in >= inend)
651 break;
652 if (((d= *in++) & 0xC0) != 0x80) {
653 *outlen = out - outstart;
654 *inlen = processed - instart;
655 return(-2);
656 }
657 c <<= 6;
658 c |= d & 0x3F;
659 }
660
661 /* assertion: c is a single UTF-4 value */
662 if (c <= 0xFF) {
663 if (out >= outend)
664 break;
665 *out++ = c;
666 } else {
667 /* no chance for this in IsoLat1 */
668 *outlen = out - outstart;
669 *inlen = processed - instart;
670 return(-2);
671 }
672 processed = in;
673 }
674 *outlen = out - outstart;
675 *inlen = processed - instart;
676 return(0);
677}
678
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is decoded byte by byte, so the
 * result depends neither on the byte order of the host nor on the
 * alignment of @inb.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 *         surrogate which is not followed by a low surrogate)
 * The value of *@inlenb after return is the number of octets consumed;
 * an odd trailing byte or a surrogate pair cut by the end of the block
 * is left unconsumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    /*
     * Keeping more than 5 bytes free guarantees that the longest UTF-8
     * form (4 bytes) always fits, so no sequence is ever cut.
     */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* low surrogate not received yet */
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
766
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written byte by byte, so the result
 * depends neither on the byte order of the host nor on the alignment of
 * @outb. Calling with @in == NULL is an initialization request: the
 * Byte Order Mark is written if @outb has room for it.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written, or -2
 *         if the transcoding failed (malformed UTF-8 or a value which
 *         cannot be expressed in UTF-16)
 * The value of *@inlen after return is the number of octets consumed;
 * an incomplete trailing sequence is left unconsumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failed; /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failed; /* no chance for this in UTF-16 */

        /* sequence cut by the end of the block: wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a malformed sequence must not yield a made-up character */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        } else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
        } else {
            /* beyond the range UTF-16 can express */
            goto failed;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

failed:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
885
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is decoded byte by byte, so the
 * result depends neither on the byte order of the host nor on the
 * alignment of @inb. A character is only converted when its whole UTF-8
 * form fits in @out, so @out never ends with a partial sequence.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 *         surrogate which is not followed by a low surrogate)
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits, needed;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) needed = 1;
        else if (c <   0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else                  needed = 4;
        /* no room for the whole sequence: leave the character unconsumed */
        if (outend - out < needed)
            break;

        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
977
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written byte by byte, so the result
 * depends neither on the byte order of the host nor on the alignment of
 * @outb. Calling with @in == NULL is an initialization request: the
 * Byte Order Mark is written if @outb has room for it.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written, or -2
 *         if the transcoding failed (malformed UTF-8 or a value which
 *         cannot be expressed in UTF-16)
 * The value of *@inlen after return is the number of octets consumed;
 * an incomplete trailing sequence is left unconsumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failed; /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failed; /* no chance for this in UTF-16 */

        /* sequence cut by the end of the block: wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a malformed sequence must not yield a made-up character */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) (c & 0xFF);
        } else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
        } else {
            /* beyond the range UTF-16 can express */
            goto failed;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

failed:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
1093
Daniel Veillard97ac1312001-05-30 19:14:17 +00001094/************************************************************************
1095 * *
1096 * Generic encoding handling routines *
1097 * *
1098 ************************************************************************/
1099
Owen Taylor3473f882001-02-23 17:55:21 +00001100/**
1101 * xmlDetectCharEncoding:
1102 * @in: a pointer to the first bytes of the XML entity, must be at least
1103 * 4 bytes long.
1104 * @len: pointer to the length of the buffer
1105 *
1106 * Guess the encoding of the entity using the first bytes of the entity content
1107 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1108 *
1109 * Returns one of the XML_CHAR_ENCODING_... values.
1110 */
1111xmlCharEncoding
1112xmlDetectCharEncoding(const unsigned char* in, int len)
1113{
1114 if (len >= 4) {
1115 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1116 (in[2] == 0x00) && (in[3] == 0x3C))
1117 return(XML_CHAR_ENCODING_UCS4BE);
1118 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1119 (in[2] == 0x00) && (in[3] == 0x00))
1120 return(XML_CHAR_ENCODING_UCS4LE);
1121 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1122 (in[2] == 0x3C) && (in[3] == 0x00))
1123 return(XML_CHAR_ENCODING_UCS4_2143);
1124 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1125 (in[2] == 0x00) && (in[3] == 0x00))
1126 return(XML_CHAR_ENCODING_UCS4_3412);
1127 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1128 (in[2] == 0xA7) && (in[3] == 0x94))
1129 return(XML_CHAR_ENCODING_EBCDIC);
1130 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1131 (in[2] == 0x78) && (in[3] == 0x6D))
1132 return(XML_CHAR_ENCODING_UTF8);
1133 }
Daniel Veillard87a764e2001-06-20 17:41:10 +00001134 if (len >= 3) {
1135 /*
1136 * Errata on XML-1.0 June 20 2001
1137 * We now allow an UTF8 encoded BOM
1138 */
1139 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1140 (in[2] == 0xBF))
1141 return(XML_CHAR_ENCODING_UTF8);
1142 }
Owen Taylor3473f882001-02-23 17:55:21 +00001143 if (len >= 2) {
1144 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1145 return(XML_CHAR_ENCODING_UTF16BE);
1146 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1147 return(XML_CHAR_ENCODING_UTF16LE);
1148 }
1149 return(XML_CHAR_ENCODING_NONE);
1150}
1151
1152/**
1153 * xmlCleanupEncodingAliases:
1154 *
1155 * Unregisters all aliases
1156 */
1157void
1158xmlCleanupEncodingAliases(void) {
1159 int i;
1160
1161 if (xmlCharEncodingAliases == NULL)
1162 return;
1163
1164 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1165 if (xmlCharEncodingAliases[i].name != NULL)
1166 xmlFree((char *) xmlCharEncodingAliases[i].name);
1167 if (xmlCharEncodingAliases[i].alias != NULL)
1168 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1169 }
1170 xmlCharEncodingAliasesNb = 0;
1171 xmlCharEncodingAliasesMax = 0;
1172 xmlFree(xmlCharEncodingAliases);
1173}
1174
1175/**
1176 * xmlGetEncodingAlias:
1177 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1178 *
1179 * Lookup an encoding name for the given alias.
1180 *
1181 * Returns NULL if not found the original name otherwise
1182 */
1183const char *
1184xmlGetEncodingAlias(const char *alias) {
1185 int i;
1186 char upper[100];
1187
1188 if (alias == NULL)
1189 return(NULL);
1190
1191 if (xmlCharEncodingAliases == NULL)
1192 return(NULL);
1193
1194 for (i = 0;i < 99;i++) {
1195 upper[i] = toupper(alias[i]);
1196 if (upper[i] == 0) break;
1197 }
1198 upper[i] = 0;
1199
1200 /*
1201 * Walk down the list looking for a definition of the alias
1202 */
1203 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1204 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1205 return(xmlCharEncodingAliases[i].name);
1206 }
1207 }
1208 return(NULL);
1209}
1210
1211/**
1212 * xmlAddEncodingAlias:
1213 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1214 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1215 *
1216 * Registers and alias @alias for an encoding named @name. Existing alias
1217 * will be overwritten.
1218 *
1219 * Returns 0 in case of success, -1 in case of error
1220 */
1221int
1222xmlAddEncodingAlias(const char *name, const char *alias) {
1223 int i;
1224 char upper[100];
1225
1226 if ((name == NULL) || (alias == NULL))
1227 return(-1);
1228
1229 for (i = 0;i < 99;i++) {
1230 upper[i] = toupper(alias[i]);
1231 if (upper[i] == 0) break;
1232 }
1233 upper[i] = 0;
1234
1235 if (xmlCharEncodingAliases == NULL) {
1236 xmlCharEncodingAliasesNb = 0;
1237 xmlCharEncodingAliasesMax = 20;
1238 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1239 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1240 if (xmlCharEncodingAliases == NULL)
1241 return(-1);
1242 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1243 xmlCharEncodingAliasesMax *= 2;
1244 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1245 xmlRealloc(xmlCharEncodingAliases,
1246 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1247 }
1248 /*
1249 * Walk down the list looking for a definition of the alias
1250 */
1251 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1252 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1253 /*
1254 * Replace the definition.
1255 */
1256 xmlFree((char *) xmlCharEncodingAliases[i].name);
1257 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1258 return(0);
1259 }
1260 }
1261 /*
1262 * Add the definition
1263 */
1264 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1265 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1266 xmlCharEncodingAliasesNb++;
1267 return(0);
1268}
1269
1270/**
1271 * xmlDelEncodingAlias:
1272 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1273 *
1274 * Unregisters an encoding alias @alias
1275 *
1276 * Returns 0 in case of success, -1 in case of error
1277 */
1278int
1279xmlDelEncodingAlias(const char *alias) {
1280 int i;
1281
1282 if (alias == NULL)
1283 return(-1);
1284
1285 if (xmlCharEncodingAliases == NULL)
1286 return(-1);
1287 /*
1288 * Walk down the list looking for a definition of the alias
1289 */
1290 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1291 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1292 xmlFree((char *) xmlCharEncodingAliases[i].name);
1293 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1294 xmlCharEncodingAliasesNb--;
1295 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1296 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1297 return(0);
1298 }
1299 }
1300 return(-1);
1301}
1302
1303/**
1304 * xmlParseCharEncoding:
1305 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1306 *
1307 * Conpare the string to the known encoding schemes already known. Note
1308 * that the comparison is case insensitive accordingly to the section
1309 * [XML] 4.3.3 Character Encoding in Entities.
1310 *
1311 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1312 * if not recognized.
1313 */
1314xmlCharEncoding
1315xmlParseCharEncoding(const char* name)
1316{
1317 const char *alias;
1318 char upper[500];
1319 int i;
1320
1321 if (name == NULL)
1322 return(XML_CHAR_ENCODING_NONE);
1323
1324 /*
1325 * Do the alias resolution
1326 */
1327 alias = xmlGetEncodingAlias(name);
1328 if (alias != NULL)
1329 name = alias;
1330
1331 for (i = 0;i < 499;i++) {
1332 upper[i] = toupper(name[i]);
1333 if (upper[i] == 0) break;
1334 }
1335 upper[i] = 0;
1336
1337 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1338 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1339 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1340
1341 /*
1342 * NOTE: if we were able to parse this, the endianness of UTF16 is
1343 * already found and in use
1344 */
1345 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1346 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1347
1348 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1349 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1350 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1351
1352 /*
1353 * NOTE: if we were able to parse this, the endianness of UCS4 is
1354 * already found and in use
1355 */
1356 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1357 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1358 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1359
1360
1361 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1362 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1363 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1364
1365 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1366 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1367 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1368
1369 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1370 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1371 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1372 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1373 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1374 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1375 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1376
1377 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1378 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1379 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1380
1381#ifdef DEBUG_ENCODING
1382 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1383#endif
1384 return(XML_CHAR_ENCODING_ERROR);
1385}
1386
1387/**
1388 * xmlGetCharEncodingName:
1389 * @enc: the encoding
1390 *
1391 * The "canonical" name for XML encoding.
1392 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1393 * Section 4.3.3 Character Encoding in Entities
1394 *
1395 * Returns the canonical name for the given encoding
1396 */
1397
1398const char*
1399xmlGetCharEncodingName(xmlCharEncoding enc) {
1400 switch (enc) {
1401 case XML_CHAR_ENCODING_ERROR:
1402 return(NULL);
1403 case XML_CHAR_ENCODING_NONE:
1404 return(NULL);
1405 case XML_CHAR_ENCODING_UTF8:
1406 return("UTF-8");
1407 case XML_CHAR_ENCODING_UTF16LE:
1408 return("UTF-16");
1409 case XML_CHAR_ENCODING_UTF16BE:
1410 return("UTF-16");
1411 case XML_CHAR_ENCODING_EBCDIC:
1412 return("EBCDIC");
1413 case XML_CHAR_ENCODING_UCS4LE:
1414 return("ISO-10646-UCS-4");
1415 case XML_CHAR_ENCODING_UCS4BE:
1416 return("ISO-10646-UCS-4");
1417 case XML_CHAR_ENCODING_UCS4_2143:
1418 return("ISO-10646-UCS-4");
1419 case XML_CHAR_ENCODING_UCS4_3412:
1420 return("ISO-10646-UCS-4");
1421 case XML_CHAR_ENCODING_UCS2:
1422 return("ISO-10646-UCS-2");
1423 case XML_CHAR_ENCODING_8859_1:
1424 return("ISO-8859-1");
1425 case XML_CHAR_ENCODING_8859_2:
1426 return("ISO-8859-2");
1427 case XML_CHAR_ENCODING_8859_3:
1428 return("ISO-8859-3");
1429 case XML_CHAR_ENCODING_8859_4:
1430 return("ISO-8859-4");
1431 case XML_CHAR_ENCODING_8859_5:
1432 return("ISO-8859-5");
1433 case XML_CHAR_ENCODING_8859_6:
1434 return("ISO-8859-6");
1435 case XML_CHAR_ENCODING_8859_7:
1436 return("ISO-8859-7");
1437 case XML_CHAR_ENCODING_8859_8:
1438 return("ISO-8859-8");
1439 case XML_CHAR_ENCODING_8859_9:
1440 return("ISO-8859-9");
1441 case XML_CHAR_ENCODING_2022_JP:
1442 return("ISO-2022-JP");
1443 case XML_CHAR_ENCODING_SHIFT_JIS:
1444 return("Shift-JIS");
1445 case XML_CHAR_ENCODING_EUC_JP:
1446 return("EUC-JP");
1447 case XML_CHAR_ENCODING_ASCII:
1448 return(NULL);
1449 }
1450 return(NULL);
1451}
1452
Daniel Veillard97ac1312001-05-30 19:14:17 +00001453/************************************************************************
1454 * *
1455 * Char encoding handlers *
1456 * *
1457 ************************************************************************/
1458
Owen Taylor3473f882001-02-23 17:55:21 +00001459
/*
 * Registry of the encoding handlers: "handlers" is an array with room for
 * MAX_ENCODING_HANDLERS entries allocated by xmlInitCharEncodingHandlers(),
 * and "nbCharEncodingHandler" is the number of entries currently in use.
 * the size should be growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
static xmlCharEncodingHandlerPtr *handlers = NULL;
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1471
1472/**
1473 * xmlNewCharEncodingHandler:
1474 * @name: the encoding name, in UTF-8 format (ASCII actually)
1475 * @input: the xmlCharEncodingInputFunc to read that encoding
1476 * @output: the xmlCharEncodingOutputFunc to write that encoding
1477 *
1478 * Create and registers an xmlCharEncodingHandler.
1479 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1480 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001481static xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001482xmlNewCharEncodingHandler(const char *name,
1483 xmlCharEncodingInputFunc input,
1484 xmlCharEncodingOutputFunc output) {
1485 xmlCharEncodingHandlerPtr handler;
1486 const char *alias;
1487 char upper[500];
1488 int i;
1489 char *up = 0;
1490
1491 /*
1492 * Do the alias resolution
1493 */
1494 alias = xmlGetEncodingAlias(name);
1495 if (alias != NULL)
1496 name = alias;
1497
1498 /*
1499 * Keep only the uppercase version of the encoding.
1500 */
1501 if (name == NULL) {
1502 xmlGenericError(xmlGenericErrorContext,
1503 "xmlNewCharEncodingHandler : no name !\n");
1504 return(NULL);
1505 }
1506 for (i = 0;i < 499;i++) {
1507 upper[i] = toupper(name[i]);
1508 if (upper[i] == 0) break;
1509 }
1510 upper[i] = 0;
1511 up = xmlMemStrdup(upper);
1512 if (up == NULL) {
1513 xmlGenericError(xmlGenericErrorContext,
1514 "xmlNewCharEncodingHandler : out of memory !\n");
1515 return(NULL);
1516 }
1517
1518 /*
1519 * allocate and fill-up an handler block.
1520 */
1521 handler = (xmlCharEncodingHandlerPtr)
1522 xmlMalloc(sizeof(xmlCharEncodingHandler));
1523 if (handler == NULL) {
1524 xmlGenericError(xmlGenericErrorContext,
1525 "xmlNewCharEncodingHandler : out of memory !\n");
1526 return(NULL);
1527 }
1528 handler->input = input;
1529 handler->output = output;
1530 handler->name = up;
1531
1532#ifdef LIBXML_ICONV_ENABLED
1533 handler->iconv_in = NULL;
1534 handler->iconv_out = NULL;
1535#endif /* LIBXML_ICONV_ENABLED */
1536
1537 /*
1538 * registers and returns the handler.
1539 */
1540 xmlRegisterCharEncodingHandler(handler);
1541#ifdef DEBUG_ENCODING
1542 xmlGenericError(xmlGenericErrorContext,
1543 "Registered encoding handler for %s\n", name);
1544#endif
1545 return(handler);
1546}
1547
1548/**
1549 * xmlInitCharEncodingHandlers:
1550 *
1551 * Initialize the char encoding support, it registers the default
1552 * encoding supported.
1553 * NOTE: while public, this function usually doesn't need to be called
1554 * in normal processing.
1555 */
1556void
1557xmlInitCharEncodingHandlers(void) {
1558 unsigned short int tst = 0x1234;
1559 unsigned char *ptr = (unsigned char *) &tst;
1560
1561 if (handlers != NULL) return;
1562
1563 handlers = (xmlCharEncodingHandlerPtr *)
1564 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1565
1566 if (*ptr == 0x12) xmlLittleEndian = 0;
1567 else if (*ptr == 0x34) xmlLittleEndian = 1;
1568 else xmlGenericError(xmlGenericErrorContext,
1569 "Odd problem at endianness detection\n");
1570
1571 if (handlers == NULL) {
1572 xmlGenericError(xmlGenericErrorContext,
1573 "xmlInitCharEncodingHandlers : out of memory !\n");
1574 return;
1575 }
1576 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1577 xmlUTF16LEHandler =
1578 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1579 xmlUTF16BEHandler =
1580 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1581 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1582 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard20042422001-05-31 18:22:04 +00001583 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor3473f882001-02-23 17:55:21 +00001584#ifdef LIBXML_HTML_ENABLED
1585 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1586#endif
1587}
1588
1589/**
1590 * xmlCleanupCharEncodingHandlers:
1591 *
1592 * Cleanup the memory allocated for the char encoding support, it
1593 * unregisters all the encoding handlers and the aliases.
1594 */
1595void
1596xmlCleanupCharEncodingHandlers(void) {
1597 xmlCleanupEncodingAliases();
1598
1599 if (handlers == NULL) return;
1600
1601 for (;nbCharEncodingHandler > 0;) {
1602 nbCharEncodingHandler--;
1603 if (handlers[nbCharEncodingHandler] != NULL) {
1604 if (handlers[nbCharEncodingHandler]->name != NULL)
1605 xmlFree(handlers[nbCharEncodingHandler]->name);
1606 xmlFree(handlers[nbCharEncodingHandler]);
1607 }
1608 }
1609 xmlFree(handlers);
1610 handlers = NULL;
1611 nbCharEncodingHandler = 0;
1612 xmlDefaultCharEncodingHandler = NULL;
1613}
1614
1615/**
1616 * xmlRegisterCharEncodingHandler:
1617 * @handler: the xmlCharEncodingHandlerPtr handler block
1618 *
1619 * Register the char encoding handler, surprizing, isn't it ?
1620 */
1621void
1622xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1623 if (handlers == NULL) xmlInitCharEncodingHandlers();
1624 if (handler == NULL) {
1625 xmlGenericError(xmlGenericErrorContext,
1626 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1627 return;
1628 }
1629
1630 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1631 xmlGenericError(xmlGenericErrorContext,
1632 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1633 xmlGenericError(xmlGenericErrorContext,
1634 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1635 return;
1636 }
1637 handlers[nbCharEncodingHandler++] = handler;
1638}
1639
1640/**
1641 * xmlGetCharEncodingHandler:
1642 * @enc: an xmlCharEncoding value.
1643 *
1644 * Search in the registrered set the handler able to read/write that encoding.
1645 *
1646 * Returns the handler or NULL if not found
1647 */
1648xmlCharEncodingHandlerPtr
1649xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1650 xmlCharEncodingHandlerPtr handler;
1651
1652 if (handlers == NULL) xmlInitCharEncodingHandlers();
1653 switch (enc) {
1654 case XML_CHAR_ENCODING_ERROR:
1655 return(NULL);
1656 case XML_CHAR_ENCODING_NONE:
1657 return(NULL);
1658 case XML_CHAR_ENCODING_UTF8:
1659 return(NULL);
1660 case XML_CHAR_ENCODING_UTF16LE:
1661 return(xmlUTF16LEHandler);
1662 case XML_CHAR_ENCODING_UTF16BE:
1663 return(xmlUTF16BEHandler);
1664 case XML_CHAR_ENCODING_EBCDIC:
1665 handler = xmlFindCharEncodingHandler("EBCDIC");
1666 if (handler != NULL) return(handler);
1667 handler = xmlFindCharEncodingHandler("ebcdic");
1668 if (handler != NULL) return(handler);
1669 break;
1670 case XML_CHAR_ENCODING_UCS4BE:
1671 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1672 if (handler != NULL) return(handler);
1673 handler = xmlFindCharEncodingHandler("UCS-4");
1674 if (handler != NULL) return(handler);
1675 handler = xmlFindCharEncodingHandler("UCS4");
1676 if (handler != NULL) return(handler);
1677 break;
1678 case XML_CHAR_ENCODING_UCS4LE:
1679 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1680 if (handler != NULL) return(handler);
1681 handler = xmlFindCharEncodingHandler("UCS-4");
1682 if (handler != NULL) return(handler);
1683 handler = xmlFindCharEncodingHandler("UCS4");
1684 if (handler != NULL) return(handler);
1685 break;
1686 case XML_CHAR_ENCODING_UCS4_2143:
1687 break;
1688 case XML_CHAR_ENCODING_UCS4_3412:
1689 break;
1690 case XML_CHAR_ENCODING_UCS2:
1691 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1692 if (handler != NULL) return(handler);
1693 handler = xmlFindCharEncodingHandler("UCS-2");
1694 if (handler != NULL) return(handler);
1695 handler = xmlFindCharEncodingHandler("UCS2");
1696 if (handler != NULL) return(handler);
1697 break;
1698
1699 /*
1700 * We used to keep ISO Latin encodings native in the
1701 * generated data. This led to so many problems that
1702 * this has been removed. One can still change this
1703 * back by registering no-ops encoders for those
1704 */
1705 case XML_CHAR_ENCODING_8859_1:
1706 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1707 if (handler != NULL) return(handler);
1708 break;
1709 case XML_CHAR_ENCODING_8859_2:
1710 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1711 if (handler != NULL) return(handler);
1712 break;
1713 case XML_CHAR_ENCODING_8859_3:
1714 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1715 if (handler != NULL) return(handler);
1716 break;
1717 case XML_CHAR_ENCODING_8859_4:
1718 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1719 if (handler != NULL) return(handler);
1720 break;
1721 case XML_CHAR_ENCODING_8859_5:
1722 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1723 if (handler != NULL) return(handler);
1724 break;
1725 case XML_CHAR_ENCODING_8859_6:
1726 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1727 if (handler != NULL) return(handler);
1728 break;
1729 case XML_CHAR_ENCODING_8859_7:
1730 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1731 if (handler != NULL) return(handler);
1732 break;
1733 case XML_CHAR_ENCODING_8859_8:
1734 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1735 if (handler != NULL) return(handler);
1736 break;
1737 case XML_CHAR_ENCODING_8859_9:
1738 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1739 if (handler != NULL) return(handler);
1740 break;
1741
1742
1743 case XML_CHAR_ENCODING_2022_JP:
1744 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1745 if (handler != NULL) return(handler);
1746 break;
1747 case XML_CHAR_ENCODING_SHIFT_JIS:
1748 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1749 if (handler != NULL) return(handler);
1750 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1751 if (handler != NULL) return(handler);
1752 handler = xmlFindCharEncodingHandler("Shift_JIS");
1753 if (handler != NULL) return(handler);
1754 break;
1755 case XML_CHAR_ENCODING_EUC_JP:
1756 handler = xmlFindCharEncodingHandler("EUC-JP");
1757 if (handler != NULL) return(handler);
1758 break;
1759 default:
1760 break;
1761 }
1762
1763#ifdef DEBUG_ENCODING
1764 xmlGenericError(xmlGenericErrorContext,
1765 "No handler found for encoding %d\n", enc);
1766#endif
1767 return(NULL);
1768}
1769
1770/**
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001771 * xmlFindCharEncodingHandler:
1772 * @name: a string describing the char encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001773 *
1774 * Search in the registrered set the handler able to read/write that encoding.
1775 *
1776 * Returns the handler or NULL if not found
1777 */
1778xmlCharEncodingHandlerPtr
1779xmlFindCharEncodingHandler(const char *name) {
1780 const char *nalias;
1781 const char *norig;
1782 xmlCharEncoding alias;
1783#ifdef LIBXML_ICONV_ENABLED
1784 xmlCharEncodingHandlerPtr enc;
1785 iconv_t icv_in, icv_out;
1786#endif /* LIBXML_ICONV_ENABLED */
1787 char upper[100];
1788 int i;
1789
1790 if (handlers == NULL) xmlInitCharEncodingHandlers();
1791 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1792 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1793
1794 /*
1795 * Do the alias resolution
1796 */
1797 norig = name;
1798 nalias = xmlGetEncodingAlias(name);
1799 if (nalias != NULL)
1800 name = nalias;
1801
1802 /*
1803 * Check first for directly registered encoding names
1804 */
1805 for (i = 0;i < 99;i++) {
1806 upper[i] = toupper(name[i]);
1807 if (upper[i] == 0) break;
1808 }
1809 upper[i] = 0;
1810
1811 for (i = 0;i < nbCharEncodingHandler; i++)
1812 if (!strcmp(upper, handlers[i]->name)) {
1813#ifdef DEBUG_ENCODING
1814 xmlGenericError(xmlGenericErrorContext,
1815 "Found registered handler for encoding %s\n", name);
1816#endif
1817 return(handlers[i]);
1818 }
1819
1820#ifdef LIBXML_ICONV_ENABLED
1821 /* check whether iconv can handle this */
1822 icv_in = iconv_open("UTF-8", name);
1823 icv_out = iconv_open(name, "UTF-8");
1824 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1825 enc = (xmlCharEncodingHandlerPtr)
1826 xmlMalloc(sizeof(xmlCharEncodingHandler));
1827 if (enc == NULL) {
1828 iconv_close(icv_in);
1829 iconv_close(icv_out);
1830 return(NULL);
1831 }
1832 enc->name = xmlMemStrdup(name);
1833 enc->input = NULL;
1834 enc->output = NULL;
1835 enc->iconv_in = icv_in;
1836 enc->iconv_out = icv_out;
1837#ifdef DEBUG_ENCODING
1838 xmlGenericError(xmlGenericErrorContext,
1839 "Found iconv handler for encoding %s\n", name);
1840#endif
1841 return enc;
1842 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1843 xmlGenericError(xmlGenericErrorContext,
1844 "iconv : problems with filters for '%s'\n", name);
1845 }
1846#endif /* LIBXML_ICONV_ENABLED */
1847
1848#ifdef DEBUG_ENCODING
1849 xmlGenericError(xmlGenericErrorContext,
1850 "No handler found for encoding %s\n", name);
1851#endif
1852
1853 /*
1854 * Fallback using the canonical names
1855 */
1856 alias = xmlParseCharEncoding(norig);
1857 if (alias != XML_CHAR_ENCODING_ERROR) {
1858 const char* canon;
1859 canon = xmlGetCharEncodingName(alias);
1860 if ((canon != NULL) && (strcmp(name, canon))) {
1861 return(xmlFindCharEncodingHandler(canon));
1862 }
1863 }
1864
1865 return(NULL);
1866}
1867
Daniel Veillard97ac1312001-05-30 19:14:17 +00001868/************************************************************************
1869 * *
1870 * ICONV based generic conversion functions *
1871 * *
1872 ************************************************************************/
1873
Owen Taylor3473f882001-02-23 17:55:21 +00001874#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert
 * @inlen:  the length of @in
 *
 * Runs one iconv() call over the given buffers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (the input is not valid for the source
 *        encoding or can't be represented in the target encoding), or
 *     -3 if the input ends with an incomplete sequence, or for any other
 *        iconv failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t inLeft = *inlen;
    size_t outLeft = *outlen;
    const char *src = (const char *) in;
    char *dst = (char *) out;
    int rc;

    rc = iconv(cd, (char **) &src, &inLeft, &dst, &outLeft);

    /* report what was used: initial sizes minus what iconv left over */
    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
    } else {
        *inlen -= inLeft;
        *outlen -= outLeft;
    }

    /* everything was converted and iconv reported no error */
    if ((inLeft == 0) && (rc != -1))
        return 0;

#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL (incomplete input sequence) and anything else */
    return -3;
}
1933#endif /* LIBXML_ICONV_ENABLED */
1934
Daniel Veillard97ac1312001-05-30 19:14:17 +00001935/************************************************************************
1936 * *
1937 * The real API used by libxml for on-the-fly conversion *
1938 * *
1939 ************************************************************************/
1940
Owen Taylor3473f882001-02-23 17:55:21 +00001941/**
1942 * xmlCharEncFirstLine:
1943 * @handler: char enconding transformation data structure
1944 * @out: an xmlBuffer for the output.
1945 * @in: an xmlBuffer for the input
1946 *
1947 * Front-end for the encoding handler input function, but handle only
1948 * the very first line, i.e. limit itself to 45 chars.
1949 *
1950 * Returns the number of byte written if success, or
1951 * -1 general error
1952 * -2 if the transcoding fails (for *in is not valid utf8 string or
1953 * the result of transformation can't fit into the encoding we want), or
1954 */
1955int
1956xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1957 xmlBufferPtr in) {
1958 int ret = -2;
1959 int written;
1960 int toconv;
1961
1962 if (handler == NULL) return(-1);
1963 if (out == NULL) return(-1);
1964 if (in == NULL) return(-1);
1965
1966 written = out->size - out->use;
1967 toconv = in->use;
1968 if (toconv * 2 >= written) {
1969 xmlBufferGrow(out, toconv);
1970 written = out->size - out->use - 1;
1971 }
1972
1973 /*
1974 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1975 * 45 chars should be sufficient to reach the end of the encoding
1976 * decalration without going too far inside the document content.
1977 */
1978 written = 45;
1979
1980 if (handler->input != NULL) {
1981 ret = handler->input(&out->content[out->use], &written,
1982 in->content, &toconv);
1983 xmlBufferShrink(in, toconv);
1984 out->use += written;
1985 out->content[out->use] = 0;
1986 }
1987#ifdef LIBXML_ICONV_ENABLED
1988 else if (handler->iconv_in != NULL) {
1989 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1990 &written, in->content, &toconv);
1991 xmlBufferShrink(in, toconv);
1992 out->use += written;
1993 out->content[out->use] = 0;
1994 if (ret == -1) ret = -3;
1995 }
1996#endif /* LIBXML_ICONV_ENABLED */
1997#ifdef DEBUG_ENCODING
1998 switch (ret) {
1999 case 0:
2000 xmlGenericError(xmlGenericErrorContext,
2001 "converted %d bytes to %d bytes of input\n",
2002 toconv, written);
2003 break;
2004 case -1:
2005 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2006 toconv, written, in->use);
2007 break;
2008 case -2:
2009 xmlGenericError(xmlGenericErrorContext,
2010 "input conversion failed due to input error\n");
2011 break;
2012 case -3:
2013 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2014 toconv, written, in->use);
2015 break;
2016 default:
2017 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2018 }
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002019#endif /* DEBUG_ENCODING */
Owen Taylor3473f882001-02-23 17:55:21 +00002020 /*
2021 * Ignore when input buffer is not on a boundary
2022 */
2023 if (ret == -3) ret = 0;
2024 if (ret == -1) ret = 0;
2025 return(ret);
2026}
2027
2028/**
2029 * xmlCharEncInFunc:
2030 * @handler: char enconding transformation data structure
2031 * @out: an xmlBuffer for the output.
2032 * @in: an xmlBuffer for the input
2033 *
2034 * Generic front-end for the encoding handler input function
2035 *
2036 * Returns the number of byte written if success, or
2037 * -1 general error
2038 * -2 if the transcoding fails (for *in is not valid utf8 string or
2039 * the result of transformation can't fit into the encoding we want), or
2040 */
2041int
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002042xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2043 xmlBufferPtr in)
2044{
Owen Taylor3473f882001-02-23 17:55:21 +00002045 int ret = -2;
2046 int written;
2047 int toconv;
2048
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002049 if (handler == NULL)
2050 return (-1);
2051 if (out == NULL)
2052 return (-1);
2053 if (in == NULL)
2054 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00002055
2056 toconv = in->use;
2057 if (toconv == 0)
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002058 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00002059 written = out->size - out->use;
2060 if (toconv * 2 >= written) {
2061 xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002062 written = out->size - out->use - 1;
Owen Taylor3473f882001-02-23 17:55:21 +00002063 }
2064 if (handler->input != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002065 ret = handler->input(&out->content[out->use], &written,
2066 in->content, &toconv);
2067 xmlBufferShrink(in, toconv);
2068 out->use += written;
2069 out->content[out->use] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002070 }
2071#ifdef LIBXML_ICONV_ENABLED
2072 else if (handler->iconv_in != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002073 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2074 &written, in->content, &toconv);
2075 xmlBufferShrink(in, toconv);
2076 out->use += written;
2077 out->content[out->use] = 0;
2078 if (ret == -1)
2079 ret = -3;
Owen Taylor3473f882001-02-23 17:55:21 +00002080 }
2081#endif /* LIBXML_ICONV_ENABLED */
2082 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002083 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002084#ifdef DEBUG_ENCODING
2085 xmlGenericError(xmlGenericErrorContext,
2086 "converted %d bytes to %d bytes of input\n",
2087 toconv, written);
Owen Taylor3473f882001-02-23 17:55:21 +00002088#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002089 break;
2090 case -1:
2091#ifdef DEBUG_ENCODING
2092 xmlGenericError(xmlGenericErrorContext,
2093 "converted %d bytes to %d bytes of input, %d left\n",
2094 toconv, written, in->use);
2095#endif
2096 break;
2097 case -3:
2098#ifdef DEBUG_ENCODING
2099 xmlGenericError(xmlGenericErrorContext,
2100 "converted %d bytes to %d bytes of input, %d left\n",
2101 toconv, written, in->use);
2102#endif
2103 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002104 case -2:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002105 xmlGenericError(xmlGenericErrorContext,
2106 "input conversion failed due to input error\n");
2107 xmlGenericError(xmlGenericErrorContext,
2108 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2109 in->content[0], in->content[1],
2110 in->content[2], in->content[3]);
Owen Taylor3473f882001-02-23 17:55:21 +00002111 }
2112 /*
2113 * Ignore when input buffer is not on a boundary
2114 */
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002115 if (ret == -3)
2116 ret = 0;
2117 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00002118}
2119
2120/**
2121 * xmlCharEncOutFunc:
2122 * @handler: char enconding transformation data structure
2123 * @out: an xmlBuffer for the output.
2124 * @in: an xmlBuffer for the input
2125 *
2126 * Generic front-end for the encoding handler output function
2127 * a first call with @in == NULL has to be made firs to initiate the
2128 * output in case of non-stateless encoding needing to initiate their
2129 * state or the output (like the BOM in UTF16).
2130 * In case of UTF8 sequence conversion errors for the given encoder,
2131 * the content will be automatically remapped to a CharRef sequence.
2132 *
2133 * Returns the number of byte written if success, or
2134 * -1 general error
2135 * -2 if the transcoding fails (for *in is not valid utf8 string or
2136 * the result of transformation can't fit into the encoding we want), or
2137 */
2138int
2139xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2140 xmlBufferPtr in) {
2141 int ret = -2;
2142 int written;
2143 int writtentot = 0;
2144 int toconv;
2145 int output = 0;
2146
2147 if (handler == NULL) return(-1);
2148 if (out == NULL) return(-1);
2149
2150retry:
2151
2152 written = out->size - out->use;
2153
2154 /*
2155 * First specific handling of in = NULL, i.e. the initialization call
2156 */
2157 if (in == NULL) {
2158 toconv = 0;
2159 if (handler->output != NULL) {
2160 ret = handler->output(&out->content[out->use], &written,
2161 NULL, &toconv);
2162 out->use += written;
2163 out->content[out->use] = 0;
2164 }
2165#ifdef LIBXML_ICONV_ENABLED
2166 else if (handler->iconv_out != NULL) {
2167 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2168 &written, NULL, &toconv);
2169 out->use += written;
2170 out->content[out->use] = 0;
2171 }
2172#endif /* LIBXML_ICONV_ENABLED */
2173#ifdef DEBUG_ENCODING
2174 xmlGenericError(xmlGenericErrorContext,
2175 "initialized encoder\n");
2176#endif
2177 return(0);
2178 }
2179
2180 /*
2181 * Convertion itself.
2182 */
2183 toconv = in->use;
2184 if (toconv == 0)
2185 return(0);
2186 if (toconv * 2 >= written) {
2187 xmlBufferGrow(out, toconv * 2);
2188 written = out->size - out->use - 1;
2189 }
2190 if (handler->output != NULL) {
2191 ret = handler->output(&out->content[out->use], &written,
2192 in->content, &toconv);
2193 xmlBufferShrink(in, toconv);
2194 out->use += written;
2195 writtentot += written;
2196 out->content[out->use] = 0;
2197 }
2198#ifdef LIBXML_ICONV_ENABLED
2199 else if (handler->iconv_out != NULL) {
2200 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2201 &written, in->content, &toconv);
2202 xmlBufferShrink(in, toconv);
2203 out->use += written;
2204 writtentot += written;
2205 out->content[out->use] = 0;
2206 if (ret == -1) {
2207 if (written > 0) {
2208 /*
2209 * Can be a limitation of iconv
2210 */
2211 goto retry;
2212 }
2213 ret = -3;
2214 }
2215 }
2216#endif /* LIBXML_ICONV_ENABLED */
2217 else {
2218 xmlGenericError(xmlGenericErrorContext,
2219 "xmlCharEncOutFunc: no output function !\n");
2220 return(-1);
2221 }
2222
2223 if (ret >= 0) output += ret;
2224
2225 /*
2226 * Attempt to handle error cases
2227 */
2228 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002229 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002230#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002231 xmlGenericError(xmlGenericErrorContext,
2232 "converted %d bytes to %d bytes of output\n",
2233 toconv, written);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002234#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002235 break;
2236 case -1:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002237#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002238 xmlGenericError(xmlGenericErrorContext,
2239 "output conversion failed by lack of space\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002240#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002241 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002242 case -3:
2243 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2244 toconv, written, in->use);
2245 break;
2246 case -2: {
2247 int len = in->use;
2248 const xmlChar *utf = (const xmlChar *) in->content;
2249 int cur;
2250
2251 cur = xmlGetUTF8Char(utf, &len);
2252 if (cur > 0) {
2253 xmlChar charref[20];
2254
2255#ifdef DEBUG_ENCODING
2256 xmlGenericError(xmlGenericErrorContext,
2257 "handling output conversion error\n");
2258 xmlGenericError(xmlGenericErrorContext,
2259 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2260 in->content[0], in->content[1],
2261 in->content[2], in->content[3]);
2262#endif
2263 /*
2264 * Removes the UTF8 sequence, and replace it by a charref
2265 * and continue the transcoding phase, hoping the error
2266 * did not mangle the encoder state.
2267 */
Daniel Veillard16698282001-09-14 10:29:27 +00002268 sprintf((char *) charref, "&#%d;", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002269 xmlBufferShrink(in, len);
2270 xmlBufferAddHead(in, charref, -1);
2271
2272 goto retry;
2273 } else {
2274 xmlGenericError(xmlGenericErrorContext,
2275 "output conversion failed due to conv error\n");
2276 xmlGenericError(xmlGenericErrorContext,
2277 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2278 in->content[0], in->content[1],
2279 in->content[2], in->content[3]);
2280 in->content[0] = ' ';
2281 }
2282 break;
2283 }
2284 }
2285 return(ret);
2286}
2287
2288/**
2289 * xmlCharEncCloseFunc:
2290 * @handler: char enconding transformation data structure
2291 *
2292 * Generic front-end for hencoding handler close function
2293 *
2294 * Returns 0 if success, or -1 in case of error
2295 */
2296int
2297xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2298 int ret = 0;
2299 if (handler == NULL) return(-1);
2300 if (handler->name == NULL) return(-1);
2301#ifdef LIBXML_ICONV_ENABLED
2302 /*
2303 * Iconv handlers can be oused only once, free the whole block.
2304 * and the associated icon resources.
2305 */
2306 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2307 if (handler->name != NULL)
2308 xmlFree(handler->name);
2309 handler->name = NULL;
2310 if (handler->iconv_out != NULL) {
2311 if (iconv_close(handler->iconv_out))
2312 ret = -1;
2313 handler->iconv_out = NULL;
2314 }
2315 if (handler->iconv_in != NULL) {
2316 if (iconv_close(handler->iconv_in))
2317 ret = -1;
2318 handler->iconv_in = NULL;
2319 }
2320 xmlFree(handler);
2321 }
2322#endif /* LIBXML_ICONV_ENABLED */
2323#ifdef DEBUG_ENCODING
2324 if (ret)
2325 xmlGenericError(xmlGenericErrorContext,
2326 "failed to close the encoding handler\n");
2327 else
2328 xmlGenericError(xmlGenericErrorContext,
2329 "closed the encoding handler\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002330#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002331
Owen Taylor3473f882001-02-23 17:55:21 +00002332 return(ret);
2333}
2334