blob: 781d8dab6a91f71fc947a0240c7b94a0286b1a93 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
15 *
Owen Taylor3473f882001-02-23 17:55:21 +000016 * See Copyright for the status of this software.
17 *
Daniel Veillardc5d64342001-06-24 12:13:24 +000018 * daniel@veillard.com
Daniel Veillard97ac1312001-05-30 19:14:17 +000019 *
20 * UTF8 string routines from:
21 * "William M. Brack" <wbrack@mmm.com.hk>
22 *
23 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Owen Taylor3473f882001-02-23 17:55:21 +000024 */
25
Bjorn Reese70a9da52001-04-21 16:57:29 +000026#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000027
Owen Taylor3473f882001-02-23 17:55:21 +000028#include <string.h>
29
30#ifdef HAVE_CTYPE_H
31#include <ctype.h>
32#endif
33#ifdef HAVE_STDLIB_H
34#include <stdlib.h>
35#endif
Owen Taylor3473f882001-02-23 17:55:21 +000036#ifdef LIBXML_ICONV_ENABLED
37#ifdef HAVE_ERRNO_H
38#include <errno.h>
39#endif
40#endif
41#include <libxml/encoding.h>
42#include <libxml/xmlmemory.h>
43#ifdef LIBXML_HTML_ENABLED
44#include <libxml/HTMLparser.h>
45#endif
46#include <libxml/xmlerror.h>
47
Daniel Veillard22090732001-07-16 00:06:07 +000048static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
49static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +000050
51typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
52typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
53struct _xmlCharEncodingAlias {
54 const char *name;
55 const char *alias;
56};
57
58static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
59static int xmlCharEncodingAliasesNb = 0;
60static int xmlCharEncodingAliasesMax = 0;
61
62#ifdef LIBXML_ICONV_ENABLED
63#if 0
64#define DEBUG_ENCODING /* Define this to get encoding traces */
65#endif
66#endif
67
68static int xmlLittleEndian = 1;
69
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)    UTF-8 octet sequence (binary)                  *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon ! Nevertheless the  *
 * routines below do accept the 4-byte form.                            *
 *                                                                      *
 ************************************************************************/
Owen Taylor3473f882001-02-23 17:55:21 +000084
85/**
Daniel Veillarde043ee12001-04-16 14:08:07 +000086 * xmlUTF8Strlen:
87 * @utf: a sequence of UTF-8 encoded bytes
88 *
Daniel Veillard60087f32001-10-10 09:45:09 +000089 * compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillarde043ee12001-04-16 14:08:07 +000090 * checking of the content of the string.
91 *
92 * Returns the number of characters in the string or -1 in case of error
93 */
94int
Daniel Veillard97ac1312001-05-30 19:14:17 +000095xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillarde043ee12001-04-16 14:08:07 +000096 int ret = 0;
97
98 if (utf == NULL)
99 return(-1);
100
101 while (*utf != 0) {
102 if (utf[0] & 0x80) {
103 if ((utf[1] & 0xc0) != 0x80)
104 return(-1);
105 if ((utf[0] & 0xe0) == 0xe0) {
106 if ((utf[2] & 0xc0) != 0x80)
107 return(-1);
108 if ((utf[0] & 0xf0) == 0xf0) {
109 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
110 return(-1);
111 utf += 4;
112 } else {
113 utf += 3;
114 }
115 } else {
116 utf += 2;
117 }
118 } else {
119 utf++;
120 }
121 ret++;
122 }
123 return(ret);
124}
125
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the number of bytes available at @utf
 *
 * Read one UTF-8 character from @utf, using at most *@len bytes.
 *
 * Returns the character value and stores in *@len the number of bytes
 * used. On error (NULL argument, not enough bytes, malformed sequence)
 * returns -1 and stores 0 in *@len when @len is not NULL.
 */
static int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
192
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid, false if it is not or if @utf
 * is NULL.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    for (ix = 0; (c = utf[ix]) != 0;) {
        if (c & 0x80) {
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            if ((c & 0xe0) == 0xe0) {
                if ((utf[ix + 2] & 0xc0) != 0x80)
                    return(0);
                if ((c & 0xf0) == 0xf0) {
                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                        return(0);
                    /* 4-byte code */
                    ix += 4;
                } else {
                    /* 3-byte code */
                    ix += 3;
                }
            } else {
                /* 2-byte code */
                ix += 2;
            }
        } else {
            /* 1-byte code */
            ix++;
        }
    }
    return(1);
}
236
237/**
Daniel Veillard97ac1312001-05-30 19:14:17 +0000238 * xmlUTF8Strsize:
239 * @utf: a sequence of UTF-8 encoded bytes
240 * @len: the number of characters in the array
241 *
242 * storage size of an UTF8 string
243 *
244 * Returns the storage size of
245 * the first 'len' characters of ARRAY
246 *
247 */
248
249int
250xmlUTF8Strsize(const xmlChar *utf, int len) {
251 const xmlChar *ptr=utf;
252 xmlChar ch;
253
254 if (len <= 0)
255 return(0);
256
257 while ( len-- > 0) {
258 if ( !*ptr )
259 break;
260 if ( (ch = *ptr++) & 0x80)
261 while ( (ch<<=1) & 0x80 )
262 ptr++;
263 }
264 return (ptr - utf);
265}
266
267
268/**
269 * xmlUTF8Strndup:
270 * @utf: the input UTF8 *
271 * @len: the len of @utf (in chars)
272 *
273 * a strndup for array of UTF8's
274 *
275 * Returns a new UTF8 * or NULL
276 */
277xmlChar *
278xmlUTF8Strndup(const xmlChar *utf, int len) {
279 xmlChar *ret;
280 int i;
281
282 if ((utf == NULL) || (len < 0)) return(NULL);
283 i = xmlUTF8Strsize(utf, len);
284 ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
285 if (ret == NULL) {
286 xmlGenericError(xmlGenericErrorContext,
287 "malloc of %ld byte failed\n",
288 (len + 1) * (long)sizeof(xmlChar));
289 return(NULL);
290 }
291 memcpy(ret, utf, i * sizeof(xmlChar));
292 ret[i] = 0;
293 return(ret);
294}
295
296/**
297 * xmlUTF8Strpos:
298 * @utf: the input UTF8 *
299 * @pos: the position of the desired UTF8 char (in chars)
300 *
301 * a function to provide the equivalent of fetching a
302 * character from a string array
303 *
304 * Returns a pointer to the UTF8 character or NULL
305 */
306xmlChar *
307xmlUTF8Strpos(const xmlChar *utf, int pos) {
308 xmlChar ch;
309
310 if (utf == NULL) return(NULL);
311 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
312 return(NULL);
313 while (pos--) {
314 if ((ch=*utf++) == 0) return(NULL);
315 if ( ch & 0x80 ) {
316 /* if not simple ascii, verify proper format */
317 if ( (ch & 0xc0) != 0xc0 )
318 return(NULL);
319 /* then skip over remaining bytes for this char */
320 while ( (ch <<= 1) & 0x80 )
321 if ( (*utf++ & 0xc0) != 0x80 )
322 return(NULL);
323 }
324 }
325 return((xmlChar *)utf);
326}
327
328/**
329 * xmlUTF8Strloc:
330 * @utf: the input UTF8 *
331 * @utfchar: the UTF8 character to be found
332 *
333 * a function to provide relative location of a UTF8 char
334 *
335 * Returns the relative character position of the desired char
336 * or -1 if not found
337 */
338int
339xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
340 int i, size;
341 xmlChar ch;
342
343 if (utf==NULL || utfchar==NULL) return -1;
344 size = xmlUTF8Strsize(utfchar, 1);
345 for(i=0; (ch=*utf) != 0; i++) {
346 if (xmlStrncmp(utf, utfchar, size)==0)
347 return(i);
348 utf++;
349 if ( ch & 0x80 ) {
350 /* if not simple ascii, verify proper format */
351 if ( (ch & 0xc0) != 0xc0 )
352 return(-1);
353 /* then skip over remaining bytes for this char */
354 while ( (ch <<= 1) & 0x80 )
355 if ( (*utf++ & 0xc0) != 0x80 )
356 return(-1);
357 }
358 }
359
360 return(-1);
361}
362/**
363 * xmlUTF8Strsub:
364 * @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard97ac1312001-05-30 19:14:17 +0000365 * @start: relative pos of first char
366 * @len: total number to copy
367 *
368 * Note: positions are given in units of UTF-8 chars
369 *
370 * Returns a pointer to a newly created string
371 * or NULL if any problem
372 */
373
374xmlChar *
375xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
376 int i;
377 xmlChar ch;
378
379 if (utf == NULL) return(NULL);
380 if (start < 0) return(NULL);
381 if (len < 0) return(NULL);
382
383 /*
384 * Skip over any leading chars
385 */
386 for (i = 0;i < start;i++) {
387 if ((ch=*utf++) == 0) return(NULL);
388 if ( ch & 0x80 ) {
389 /* if not simple ascii, verify proper format */
390 if ( (ch & 0xc0) != 0xc0 )
391 return(NULL);
392 /* then skip over remaining bytes for this char */
393 while ( (ch <<= 1) & 0x80 )
394 if ( (*utf++ & 0xc0) != 0x80 )
395 return(NULL);
396 }
397 }
398
399 return(xmlUTF8Strndup(utf, len));
400}
401
402/************************************************************************
403 * *
404 * Conversions To/From UTF8 encoding *
405 * *
406 ************************************************************************/
407
408/**
Owen Taylor3473f882001-02-23 17:55:21 +0000409 * asciiToUTF8:
410 * @out: a pointer to an array of bytes to store the result
411 * @outlen: the length of @out
412 * @in: a pointer to an array of ASCII chars
413 * @inlen: the length of @in
414 *
415 * Take a block of ASCII chars in and try to convert it to an UTF-8
416 * block of chars out.
417 * Returns 0 if success, or -1 otherwise
418 * The value of @inlen after return is the number of octets consumed
419 * as the return value is positive, else unpredictiable.
420 * The value of @outlen after return is the number of ocetes consumed.
421 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000422static int
Owen Taylor3473f882001-02-23 17:55:21 +0000423asciiToUTF8(unsigned char* out, int *outlen,
424 const unsigned char* in, int *inlen) {
425 unsigned char* outstart = out;
426 const unsigned char* base = in;
427 const unsigned char* processed = in;
428 unsigned char* outend = out + *outlen;
429 const unsigned char* inend;
430 unsigned int c;
431 int bits;
432
433 inend = in + (*inlen);
434 while ((in < inend) && (out - outstart + 5 < *outlen)) {
435 c= *in++;
436
437 /* assertion: c is a single UTF-4 value */
438 if (out >= outend)
439 break;
440 if (c < 0x80) { *out++= c; bits= -6; }
441 else {
442 *outlen = out - outstart;
443 *inlen = processed - base;
444 return(-1);
445 }
446
447 for ( ; bits >= 0; bits-= 6) {
448 if (out >= outend)
449 break;
450 *out++= ((c >> bits) & 0x3F) | 0x80;
451 }
452 processed = (const unsigned char*) in;
453 }
454 *outlen = out - outstart;
455 *inlen = processed - base;
456 return(0);
457}
458
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. A NULL @in is an initialization call and
 * produces nothing.
 *
 * Returns 0 if success, or -2 if the transcoding fails (malformed
 * UTF-8 or a character which cannot be represented in ASCII).
 * A sequence truncated at the end of @in is not an error, it is
 * simply left unconsumed.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)  goto failed;  /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else                goto failed;  /* no chance for this in Ascii */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /*
             * A malformed continuation byte is a transcoding failure;
             * carrying on would emit a bogus value and swallow the byte.
             */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c >= 0x80)
            goto failed;                  /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

failed:
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
541
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. A character is only emitted when its whole UTF-8
 * form fits in @out, so the output never ends with a partial sequence.
 *
 * Returns 0; running out of room in @out is not an error, the
 * conversion simply stops there.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* base = in;
    const unsigned char* inend = in + (*inlen);
    unsigned int c;

    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* both bytes or nothing: never leave a dangling lead byte */
            if (outend - out < 2)
                break;
            *out++ = ((c >> 6) & 0x1F) | 0xC0;
            *out++ = (c & 0x3F) | 0x80;
        }
        in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
589
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. A NULL @in is an initialization call and
 * produces nothing.
 *
 * Returns 0 if success, or -2 if the transcoding fails (malformed
 * UTF-8 or a character above 0xFF). A sequence truncated at the end
 * of @in, or a full @out, stops the conversion without error.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *src = in;      /* read cursor */
    const unsigned char *done = in;     /* end of last converted char */
    const unsigned char *srcend;
    const unsigned char *dstend;
    unsigned char *dst = out;
    unsigned int value, byte;
    int extra;
    int status = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    srcend = in + (*inlen);
    dstend = out + (*outlen);

    while (src < srcend) {
        byte = *src++;
        if (byte < 0x80) {
            value = byte;
            extra = 0;
        } else if (byte < 0xC0) {
            /* trailing byte in leading position */
            status = -2;
            break;
        } else if (byte < 0xE0) {
            value = byte & 0x1F;
            extra = 1;
        } else if (byte < 0xF0) {
            value = byte & 0x0F;
            extra = 2;
        } else if (byte < 0xF8) {
            value = byte & 0x07;
            extra = 3;
        } else {
            /* no chance for this in IsoLat1 */
            status = -2;
            break;
        }

        /* the sequence is cut by the end of the input: stop here */
        if (srcend - src < extra)
            break;

        while (extra > 0) {
            byte = *src++;
            if ((byte & 0xC0) != 0x80) {
                status = -2;
                break;
            }
            value = (value << 6) | (byte & 0x3F);
            extra--;
        }
        if (status != 0)
            break;

        /* assertion: value is a single UTF-4 value */
        if (value > 0xFF) {
            /* no chance for this in IsoLat1 */
            status = -2;
            break;
        }
        if (dst >= dstend)
            break;
        *dst++ = value;
        done = src;
    }
    *outlen = dst - out;
    *inlen = done - in;
    return(status);
}
677
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read byte by byte, so it needs
 * no particular alignment and the result does not depend on the byte
 * order of the host. A character is only emitted when its whole UTF-8
 * form fits in @out.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate which is not followed by a low surrogate). Running out of
 * room in @out, or an input ending in the middle of a surrogate pair,
 * stops the conversion without error.
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits, needed;

    /* a trailing odd byte cannot be converted yet */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        c = in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)
                break;                   /* low surrogate not there yet */
            d = in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80)  needed = 1;
        else if (c <   0x800)  needed = 2;
        else if (c < 0x10000)  needed = 3;
        else                   needed = 4;
        if (outend - out < needed)
            break;

        if      (c <    0x80) {  *out++ =  c;                        bits = -6; }
        else if (c <   0x800) {  *out++ = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
        else if (c < 0x10000) {  *out++ = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
        else                  {  *out++ = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
765
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written byte by byte, so @outb
 * needs no particular alignment and the result does not depend on the
 * byte order of the host.
 *
 * A NULL @in is an initialization call: the FF FE Byte Order Mark is
 * written if there is room for it and 2 is returned (0 otherwise).
 *
 * Returns 0 if success, or -2 if the transcoding failed (malformed
 * UTF-8 or a value above 0x10FFFF). Running out of room in @outb, or
 * a sequence truncated at the end of @in, stops the conversion without
 * error.
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be produced */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)  goto invalid;  /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else                goto invalid;  /* no chance for this in UTF-16 */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c & 0xFF);
            out[1] = (unsigned char) (c >> 8);
            out += 2;
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi & 0xFF);
            out[1] = (unsigned char) (hi >> 8);
            out[2] = (unsigned char) (lo & 0xFF);
            out[3] = (unsigned char) (lo >> 8);
            out += 4;
        } else {
            /* beyond the range UTF-16 can encode */
            goto invalid;
        }
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(0);

invalid:
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(-2);
}
884
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read byte by byte, so it needs
 * no particular alignment and the result does not depend on the byte
 * order of the host. A character is only emitted when its whole UTF-8
 * form fits in @out.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate which is not followed by a low surrogate). Running out of
 * room in @out, or an input ending in the middle of a surrogate pair,
 * stops the conversion without error, like UTF16LEToUTF8() does.
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits, needed;

    /* a trailing odd byte cannot be converted yet */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)
                break;                   /* low surrogate not there yet */
            d = ((unsigned int) in[0] << 8) | in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80)  needed = 1;
        else if (c <   0x800)  needed = 2;
        else if (c < 0x10000)  needed = 3;
        else                   needed = 4;
        /* never emit a truncated sequence and mark its input consumed */
        if (outend - out < needed)
            break;

        if      (c <    0x80) {  *out++ =  c;                        bits = -6; }
        else if (c <   0x800) {  *out++ = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
        else if (c < 0x10000) {  *out++ = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
        else                  {  *out++ = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
976
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written byte by byte, so @outb
 * needs no particular alignment and the result does not depend on the
 * byte order of the host.
 *
 * A NULL @in is an initialization call: the FE FF Byte Order Mark is
 * written if there is room for it and 2 is returned (0 otherwise).
 *
 * Returns 0 if success, or -2 if the transcoding failed (malformed
 * UTF-8 or a value above 0x10FFFF). Running out of room in @outb, or
 * a sequence truncated at the end of @in, stops the conversion without
 * error.
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be produced */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)  goto invalid;  /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else                goto invalid;  /* no chance for this in UTF-16 */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c >> 8);
            out[1] = (unsigned char) (c & 0xFF);
            out += 2;
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi >> 8);
            out[1] = (unsigned char) (hi & 0xFF);
            out[2] = (unsigned char) (lo >> 8);
            out[3] = (unsigned char) (lo & 0xFF);
            out += 4;
        } else {
            /* beyond the range UTF-16 can encode */
            goto invalid;
        }
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(0);

invalid:
    /* lengths are reported in bytes, like on the success path */
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(-2);
}
1092
Daniel Veillard97ac1312001-05-30 19:14:17 +00001093/************************************************************************
1094 * *
1095 * Generic encoding handling routines *
1096 * *
1097 ************************************************************************/
1098
Owen Taylor3473f882001-02-23 17:55:21 +00001099/**
1100 * xmlDetectCharEncoding:
1101 * @in: a pointer to the first bytes of the XML entity, must be at least
1102 * 4 bytes long.
1103 * @len: pointer to the length of the buffer
1104 *
1105 * Guess the encoding of the entity using the first bytes of the entity content
1106 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1107 *
1108 * Returns one of the XML_CHAR_ENCODING_... values.
1109 */
1110xmlCharEncoding
1111xmlDetectCharEncoding(const unsigned char* in, int len)
1112{
1113 if (len >= 4) {
1114 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1115 (in[2] == 0x00) && (in[3] == 0x3C))
1116 return(XML_CHAR_ENCODING_UCS4BE);
1117 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1118 (in[2] == 0x00) && (in[3] == 0x00))
1119 return(XML_CHAR_ENCODING_UCS4LE);
1120 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1121 (in[2] == 0x3C) && (in[3] == 0x00))
1122 return(XML_CHAR_ENCODING_UCS4_2143);
1123 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1124 (in[2] == 0x00) && (in[3] == 0x00))
1125 return(XML_CHAR_ENCODING_UCS4_3412);
1126 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1127 (in[2] == 0xA7) && (in[3] == 0x94))
1128 return(XML_CHAR_ENCODING_EBCDIC);
1129 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1130 (in[2] == 0x78) && (in[3] == 0x6D))
1131 return(XML_CHAR_ENCODING_UTF8);
1132 }
Daniel Veillard87a764e2001-06-20 17:41:10 +00001133 if (len >= 3) {
1134 /*
1135 * Errata on XML-1.0 June 20 2001
1136 * We now allow an UTF8 encoded BOM
1137 */
1138 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1139 (in[2] == 0xBF))
1140 return(XML_CHAR_ENCODING_UTF8);
1141 }
Owen Taylor3473f882001-02-23 17:55:21 +00001142 if (len >= 2) {
1143 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1144 return(XML_CHAR_ENCODING_UTF16BE);
1145 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1146 return(XML_CHAR_ENCODING_UTF16LE);
1147 }
1148 return(XML_CHAR_ENCODING_NONE);
1149}
1150
1151/**
1152 * xmlCleanupEncodingAliases:
1153 *
1154 * Unregisters all aliases
1155 */
1156void
1157xmlCleanupEncodingAliases(void) {
1158 int i;
1159
1160 if (xmlCharEncodingAliases == NULL)
1161 return;
1162
1163 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1164 if (xmlCharEncodingAliases[i].name != NULL)
1165 xmlFree((char *) xmlCharEncodingAliases[i].name);
1166 if (xmlCharEncodingAliases[i].alias != NULL)
1167 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1168 }
1169 xmlCharEncodingAliasesNb = 0;
1170 xmlCharEncodingAliasesMax = 0;
1171 xmlFree(xmlCharEncodingAliases);
1172}
1173
1174/**
1175 * xmlGetEncodingAlias:
1176 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1177 *
1178 * Lookup an encoding name for the given alias.
1179 *
1180 * Returns NULL if not found the original name otherwise
1181 */
1182const char *
1183xmlGetEncodingAlias(const char *alias) {
1184 int i;
1185 char upper[100];
1186
1187 if (alias == NULL)
1188 return(NULL);
1189
1190 if (xmlCharEncodingAliases == NULL)
1191 return(NULL);
1192
1193 for (i = 0;i < 99;i++) {
1194 upper[i] = toupper(alias[i]);
1195 if (upper[i] == 0) break;
1196 }
1197 upper[i] = 0;
1198
1199 /*
1200 * Walk down the list looking for a definition of the alias
1201 */
1202 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1203 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1204 return(xmlCharEncodingAliases[i].name);
1205 }
1206 }
1207 return(NULL);
1208}
1209
1210/**
1211 * xmlAddEncodingAlias:
1212 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1213 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1214 *
1215 * Registers and alias @alias for an encoding named @name. Existing alias
1216 * will be overwritten.
1217 *
1218 * Returns 0 in case of success, -1 in case of error
1219 */
1220int
1221xmlAddEncodingAlias(const char *name, const char *alias) {
1222 int i;
1223 char upper[100];
1224
1225 if ((name == NULL) || (alias == NULL))
1226 return(-1);
1227
1228 for (i = 0;i < 99;i++) {
1229 upper[i] = toupper(alias[i]);
1230 if (upper[i] == 0) break;
1231 }
1232 upper[i] = 0;
1233
1234 if (xmlCharEncodingAliases == NULL) {
1235 xmlCharEncodingAliasesNb = 0;
1236 xmlCharEncodingAliasesMax = 20;
1237 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1238 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1239 if (xmlCharEncodingAliases == NULL)
1240 return(-1);
1241 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1242 xmlCharEncodingAliasesMax *= 2;
1243 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1244 xmlRealloc(xmlCharEncodingAliases,
1245 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1246 }
1247 /*
1248 * Walk down the list looking for a definition of the alias
1249 */
1250 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1251 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1252 /*
1253 * Replace the definition.
1254 */
1255 xmlFree((char *) xmlCharEncodingAliases[i].name);
1256 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1257 return(0);
1258 }
1259 }
1260 /*
1261 * Add the definition
1262 */
1263 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1264 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1265 xmlCharEncodingAliasesNb++;
1266 return(0);
1267}
1268
1269/**
1270 * xmlDelEncodingAlias:
1271 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1272 *
1273 * Unregisters an encoding alias @alias
1274 *
1275 * Returns 0 in case of success, -1 in case of error
1276 */
1277int
1278xmlDelEncodingAlias(const char *alias) {
1279 int i;
1280
1281 if (alias == NULL)
1282 return(-1);
1283
1284 if (xmlCharEncodingAliases == NULL)
1285 return(-1);
1286 /*
1287 * Walk down the list looking for a definition of the alias
1288 */
1289 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1290 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1291 xmlFree((char *) xmlCharEncodingAliases[i].name);
1292 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1293 xmlCharEncodingAliasesNb--;
1294 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1295 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1296 return(0);
1297 }
1298 }
1299 return(-1);
1300}
1301
1302/**
1303 * xmlParseCharEncoding:
1304 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1305 *
1306 * Conpare the string to the known encoding schemes already known. Note
1307 * that the comparison is case insensitive accordingly to the section
1308 * [XML] 4.3.3 Character Encoding in Entities.
1309 *
1310 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1311 * if not recognized.
1312 */
1313xmlCharEncoding
1314xmlParseCharEncoding(const char* name)
1315{
1316 const char *alias;
1317 char upper[500];
1318 int i;
1319
1320 if (name == NULL)
1321 return(XML_CHAR_ENCODING_NONE);
1322
1323 /*
1324 * Do the alias resolution
1325 */
1326 alias = xmlGetEncodingAlias(name);
1327 if (alias != NULL)
1328 name = alias;
1329
1330 for (i = 0;i < 499;i++) {
1331 upper[i] = toupper(name[i]);
1332 if (upper[i] == 0) break;
1333 }
1334 upper[i] = 0;
1335
1336 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1337 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1338 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1339
1340 /*
1341 * NOTE: if we were able to parse this, the endianness of UTF16 is
1342 * already found and in use
1343 */
1344 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1345 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1346
1347 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1348 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1349 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1350
1351 /*
1352 * NOTE: if we were able to parse this, the endianness of UCS4 is
1353 * already found and in use
1354 */
1355 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1356 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1357 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1358
1359
1360 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1361 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1362 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1363
1364 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1365 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1366 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1367
1368 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1369 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1370 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1371 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1372 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1373 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1374 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1375
1376 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1377 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1378 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1379
1380#ifdef DEBUG_ENCODING
1381 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1382#endif
1383 return(XML_CHAR_ENCODING_ERROR);
1384}
1385
1386/**
1387 * xmlGetCharEncodingName:
1388 * @enc: the encoding
1389 *
1390 * The "canonical" name for XML encoding.
1391 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1392 * Section 4.3.3 Character Encoding in Entities
1393 *
1394 * Returns the canonical name for the given encoding
1395 */
1396
1397const char*
1398xmlGetCharEncodingName(xmlCharEncoding enc) {
1399 switch (enc) {
1400 case XML_CHAR_ENCODING_ERROR:
1401 return(NULL);
1402 case XML_CHAR_ENCODING_NONE:
1403 return(NULL);
1404 case XML_CHAR_ENCODING_UTF8:
1405 return("UTF-8");
1406 case XML_CHAR_ENCODING_UTF16LE:
1407 return("UTF-16");
1408 case XML_CHAR_ENCODING_UTF16BE:
1409 return("UTF-16");
1410 case XML_CHAR_ENCODING_EBCDIC:
1411 return("EBCDIC");
1412 case XML_CHAR_ENCODING_UCS4LE:
1413 return("ISO-10646-UCS-4");
1414 case XML_CHAR_ENCODING_UCS4BE:
1415 return("ISO-10646-UCS-4");
1416 case XML_CHAR_ENCODING_UCS4_2143:
1417 return("ISO-10646-UCS-4");
1418 case XML_CHAR_ENCODING_UCS4_3412:
1419 return("ISO-10646-UCS-4");
1420 case XML_CHAR_ENCODING_UCS2:
1421 return("ISO-10646-UCS-2");
1422 case XML_CHAR_ENCODING_8859_1:
1423 return("ISO-8859-1");
1424 case XML_CHAR_ENCODING_8859_2:
1425 return("ISO-8859-2");
1426 case XML_CHAR_ENCODING_8859_3:
1427 return("ISO-8859-3");
1428 case XML_CHAR_ENCODING_8859_4:
1429 return("ISO-8859-4");
1430 case XML_CHAR_ENCODING_8859_5:
1431 return("ISO-8859-5");
1432 case XML_CHAR_ENCODING_8859_6:
1433 return("ISO-8859-6");
1434 case XML_CHAR_ENCODING_8859_7:
1435 return("ISO-8859-7");
1436 case XML_CHAR_ENCODING_8859_8:
1437 return("ISO-8859-8");
1438 case XML_CHAR_ENCODING_8859_9:
1439 return("ISO-8859-9");
1440 case XML_CHAR_ENCODING_2022_JP:
1441 return("ISO-2022-JP");
1442 case XML_CHAR_ENCODING_SHIFT_JIS:
1443 return("Shift-JIS");
1444 case XML_CHAR_ENCODING_EUC_JP:
1445 return("EUC-JP");
1446 case XML_CHAR_ENCODING_ASCII:
1447 return(NULL);
1448 }
1449 return(NULL);
1450}
1451
Daniel Veillard97ac1312001-05-30 19:14:17 +00001452/************************************************************************
1453 * *
1454 * Char encoding handlers *
1455 * *
1456 ************************************************************************/
1457
Owen Taylor3473f882001-02-23 17:55:21 +00001458
1459/* the size should be growable, but it's not a big deal ... */
1460#define MAX_ENCODING_HANDLERS 50
1461static xmlCharEncodingHandlerPtr *handlers = NULL;
1462static int nbCharEncodingHandler = 0;
1463
1464/*
1465 * The default is UTF-8 for XML, that's also the default used for the
1466 * parser internals, so the default encoding handler is NULL
1467 */
1468
1469static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1470
1471/**
1472 * xmlNewCharEncodingHandler:
1473 * @name: the encoding name, in UTF-8 format (ASCII actually)
1474 * @input: the xmlCharEncodingInputFunc to read that encoding
1475 * @output: the xmlCharEncodingOutputFunc to write that encoding
1476 *
1477 * Create and registers an xmlCharEncodingHandler.
1478 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1479 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001480static xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001481xmlNewCharEncodingHandler(const char *name,
1482 xmlCharEncodingInputFunc input,
1483 xmlCharEncodingOutputFunc output) {
1484 xmlCharEncodingHandlerPtr handler;
1485 const char *alias;
1486 char upper[500];
1487 int i;
1488 char *up = 0;
1489
1490 /*
1491 * Do the alias resolution
1492 */
1493 alias = xmlGetEncodingAlias(name);
1494 if (alias != NULL)
1495 name = alias;
1496
1497 /*
1498 * Keep only the uppercase version of the encoding.
1499 */
1500 if (name == NULL) {
1501 xmlGenericError(xmlGenericErrorContext,
1502 "xmlNewCharEncodingHandler : no name !\n");
1503 return(NULL);
1504 }
1505 for (i = 0;i < 499;i++) {
1506 upper[i] = toupper(name[i]);
1507 if (upper[i] == 0) break;
1508 }
1509 upper[i] = 0;
1510 up = xmlMemStrdup(upper);
1511 if (up == NULL) {
1512 xmlGenericError(xmlGenericErrorContext,
1513 "xmlNewCharEncodingHandler : out of memory !\n");
1514 return(NULL);
1515 }
1516
1517 /*
1518 * allocate and fill-up an handler block.
1519 */
1520 handler = (xmlCharEncodingHandlerPtr)
1521 xmlMalloc(sizeof(xmlCharEncodingHandler));
1522 if (handler == NULL) {
1523 xmlGenericError(xmlGenericErrorContext,
1524 "xmlNewCharEncodingHandler : out of memory !\n");
1525 return(NULL);
1526 }
1527 handler->input = input;
1528 handler->output = output;
1529 handler->name = up;
1530
1531#ifdef LIBXML_ICONV_ENABLED
1532 handler->iconv_in = NULL;
1533 handler->iconv_out = NULL;
1534#endif /* LIBXML_ICONV_ENABLED */
1535
1536 /*
1537 * registers and returns the handler.
1538 */
1539 xmlRegisterCharEncodingHandler(handler);
1540#ifdef DEBUG_ENCODING
1541 xmlGenericError(xmlGenericErrorContext,
1542 "Registered encoding handler for %s\n", name);
1543#endif
1544 return(handler);
1545}
1546
1547/**
1548 * xmlInitCharEncodingHandlers:
1549 *
1550 * Initialize the char encoding support, it registers the default
1551 * encoding supported.
1552 * NOTE: while public, this function usually doesn't need to be called
1553 * in normal processing.
1554 */
1555void
1556xmlInitCharEncodingHandlers(void) {
1557 unsigned short int tst = 0x1234;
1558 unsigned char *ptr = (unsigned char *) &tst;
1559
1560 if (handlers != NULL) return;
1561
1562 handlers = (xmlCharEncodingHandlerPtr *)
1563 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1564
1565 if (*ptr == 0x12) xmlLittleEndian = 0;
1566 else if (*ptr == 0x34) xmlLittleEndian = 1;
1567 else xmlGenericError(xmlGenericErrorContext,
1568 "Odd problem at endianness detection\n");
1569
1570 if (handlers == NULL) {
1571 xmlGenericError(xmlGenericErrorContext,
1572 "xmlInitCharEncodingHandlers : out of memory !\n");
1573 return;
1574 }
1575 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1576 xmlUTF16LEHandler =
1577 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1578 xmlUTF16BEHandler =
1579 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1580 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1581 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard20042422001-05-31 18:22:04 +00001582 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor3473f882001-02-23 17:55:21 +00001583#ifdef LIBXML_HTML_ENABLED
1584 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1585#endif
1586}
1587
1588/**
1589 * xmlCleanupCharEncodingHandlers:
1590 *
1591 * Cleanup the memory allocated for the char encoding support, it
1592 * unregisters all the encoding handlers and the aliases.
1593 */
1594void
1595xmlCleanupCharEncodingHandlers(void) {
1596 xmlCleanupEncodingAliases();
1597
1598 if (handlers == NULL) return;
1599
1600 for (;nbCharEncodingHandler > 0;) {
1601 nbCharEncodingHandler--;
1602 if (handlers[nbCharEncodingHandler] != NULL) {
1603 if (handlers[nbCharEncodingHandler]->name != NULL)
1604 xmlFree(handlers[nbCharEncodingHandler]->name);
1605 xmlFree(handlers[nbCharEncodingHandler]);
1606 }
1607 }
1608 xmlFree(handlers);
1609 handlers = NULL;
1610 nbCharEncodingHandler = 0;
1611 xmlDefaultCharEncodingHandler = NULL;
1612}
1613
1614/**
1615 * xmlRegisterCharEncodingHandler:
1616 * @handler: the xmlCharEncodingHandlerPtr handler block
1617 *
1618 * Register the char encoding handler, surprizing, isn't it ?
1619 */
1620void
1621xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1622 if (handlers == NULL) xmlInitCharEncodingHandlers();
1623 if (handler == NULL) {
1624 xmlGenericError(xmlGenericErrorContext,
1625 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1626 return;
1627 }
1628
1629 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1630 xmlGenericError(xmlGenericErrorContext,
1631 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1632 xmlGenericError(xmlGenericErrorContext,
1633 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1634 return;
1635 }
1636 handlers[nbCharEncodingHandler++] = handler;
1637}
1638
1639/**
1640 * xmlGetCharEncodingHandler:
1641 * @enc: an xmlCharEncoding value.
1642 *
1643 * Search in the registrered set the handler able to read/write that encoding.
1644 *
1645 * Returns the handler or NULL if not found
1646 */
1647xmlCharEncodingHandlerPtr
1648xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1649 xmlCharEncodingHandlerPtr handler;
1650
1651 if (handlers == NULL) xmlInitCharEncodingHandlers();
1652 switch (enc) {
1653 case XML_CHAR_ENCODING_ERROR:
1654 return(NULL);
1655 case XML_CHAR_ENCODING_NONE:
1656 return(NULL);
1657 case XML_CHAR_ENCODING_UTF8:
1658 return(NULL);
1659 case XML_CHAR_ENCODING_UTF16LE:
1660 return(xmlUTF16LEHandler);
1661 case XML_CHAR_ENCODING_UTF16BE:
1662 return(xmlUTF16BEHandler);
1663 case XML_CHAR_ENCODING_EBCDIC:
1664 handler = xmlFindCharEncodingHandler("EBCDIC");
1665 if (handler != NULL) return(handler);
1666 handler = xmlFindCharEncodingHandler("ebcdic");
1667 if (handler != NULL) return(handler);
1668 break;
1669 case XML_CHAR_ENCODING_UCS4BE:
1670 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1671 if (handler != NULL) return(handler);
1672 handler = xmlFindCharEncodingHandler("UCS-4");
1673 if (handler != NULL) return(handler);
1674 handler = xmlFindCharEncodingHandler("UCS4");
1675 if (handler != NULL) return(handler);
1676 break;
1677 case XML_CHAR_ENCODING_UCS4LE:
1678 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1679 if (handler != NULL) return(handler);
1680 handler = xmlFindCharEncodingHandler("UCS-4");
1681 if (handler != NULL) return(handler);
1682 handler = xmlFindCharEncodingHandler("UCS4");
1683 if (handler != NULL) return(handler);
1684 break;
1685 case XML_CHAR_ENCODING_UCS4_2143:
1686 break;
1687 case XML_CHAR_ENCODING_UCS4_3412:
1688 break;
1689 case XML_CHAR_ENCODING_UCS2:
1690 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1691 if (handler != NULL) return(handler);
1692 handler = xmlFindCharEncodingHandler("UCS-2");
1693 if (handler != NULL) return(handler);
1694 handler = xmlFindCharEncodingHandler("UCS2");
1695 if (handler != NULL) return(handler);
1696 break;
1697
1698 /*
1699 * We used to keep ISO Latin encodings native in the
1700 * generated data. This led to so many problems that
1701 * this has been removed. One can still change this
1702 * back by registering no-ops encoders for those
1703 */
1704 case XML_CHAR_ENCODING_8859_1:
1705 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1706 if (handler != NULL) return(handler);
1707 break;
1708 case XML_CHAR_ENCODING_8859_2:
1709 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1710 if (handler != NULL) return(handler);
1711 break;
1712 case XML_CHAR_ENCODING_8859_3:
1713 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1714 if (handler != NULL) return(handler);
1715 break;
1716 case XML_CHAR_ENCODING_8859_4:
1717 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1718 if (handler != NULL) return(handler);
1719 break;
1720 case XML_CHAR_ENCODING_8859_5:
1721 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1722 if (handler != NULL) return(handler);
1723 break;
1724 case XML_CHAR_ENCODING_8859_6:
1725 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1726 if (handler != NULL) return(handler);
1727 break;
1728 case XML_CHAR_ENCODING_8859_7:
1729 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1730 if (handler != NULL) return(handler);
1731 break;
1732 case XML_CHAR_ENCODING_8859_8:
1733 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1734 if (handler != NULL) return(handler);
1735 break;
1736 case XML_CHAR_ENCODING_8859_9:
1737 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1738 if (handler != NULL) return(handler);
1739 break;
1740
1741
1742 case XML_CHAR_ENCODING_2022_JP:
1743 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1744 if (handler != NULL) return(handler);
1745 break;
1746 case XML_CHAR_ENCODING_SHIFT_JIS:
1747 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1748 if (handler != NULL) return(handler);
1749 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1750 if (handler != NULL) return(handler);
1751 handler = xmlFindCharEncodingHandler("Shift_JIS");
1752 if (handler != NULL) return(handler);
1753 break;
1754 case XML_CHAR_ENCODING_EUC_JP:
1755 handler = xmlFindCharEncodingHandler("EUC-JP");
1756 if (handler != NULL) return(handler);
1757 break;
1758 default:
1759 break;
1760 }
1761
1762#ifdef DEBUG_ENCODING
1763 xmlGenericError(xmlGenericErrorContext,
1764 "No handler found for encoding %d\n", enc);
1765#endif
1766 return(NULL);
1767}
1768
1769/**
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001770 * xmlFindCharEncodingHandler:
1771 * @name: a string describing the char encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001772 *
1773 * Search in the registrered set the handler able to read/write that encoding.
1774 *
1775 * Returns the handler or NULL if not found
1776 */
1777xmlCharEncodingHandlerPtr
1778xmlFindCharEncodingHandler(const char *name) {
1779 const char *nalias;
1780 const char *norig;
1781 xmlCharEncoding alias;
1782#ifdef LIBXML_ICONV_ENABLED
1783 xmlCharEncodingHandlerPtr enc;
1784 iconv_t icv_in, icv_out;
1785#endif /* LIBXML_ICONV_ENABLED */
1786 char upper[100];
1787 int i;
1788
1789 if (handlers == NULL) xmlInitCharEncodingHandlers();
1790 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1791 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1792
1793 /*
1794 * Do the alias resolution
1795 */
1796 norig = name;
1797 nalias = xmlGetEncodingAlias(name);
1798 if (nalias != NULL)
1799 name = nalias;
1800
1801 /*
1802 * Check first for directly registered encoding names
1803 */
1804 for (i = 0;i < 99;i++) {
1805 upper[i] = toupper(name[i]);
1806 if (upper[i] == 0) break;
1807 }
1808 upper[i] = 0;
1809
1810 for (i = 0;i < nbCharEncodingHandler; i++)
1811 if (!strcmp(upper, handlers[i]->name)) {
1812#ifdef DEBUG_ENCODING
1813 xmlGenericError(xmlGenericErrorContext,
1814 "Found registered handler for encoding %s\n", name);
1815#endif
1816 return(handlers[i]);
1817 }
1818
1819#ifdef LIBXML_ICONV_ENABLED
1820 /* check whether iconv can handle this */
1821 icv_in = iconv_open("UTF-8", name);
1822 icv_out = iconv_open(name, "UTF-8");
1823 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1824 enc = (xmlCharEncodingHandlerPtr)
1825 xmlMalloc(sizeof(xmlCharEncodingHandler));
1826 if (enc == NULL) {
1827 iconv_close(icv_in);
1828 iconv_close(icv_out);
1829 return(NULL);
1830 }
1831 enc->name = xmlMemStrdup(name);
1832 enc->input = NULL;
1833 enc->output = NULL;
1834 enc->iconv_in = icv_in;
1835 enc->iconv_out = icv_out;
1836#ifdef DEBUG_ENCODING
1837 xmlGenericError(xmlGenericErrorContext,
1838 "Found iconv handler for encoding %s\n", name);
1839#endif
1840 return enc;
1841 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1842 xmlGenericError(xmlGenericErrorContext,
1843 "iconv : problems with filters for '%s'\n", name);
1844 }
1845#endif /* LIBXML_ICONV_ENABLED */
1846
1847#ifdef DEBUG_ENCODING
1848 xmlGenericError(xmlGenericErrorContext,
1849 "No handler found for encoding %s\n", name);
1850#endif
1851
1852 /*
1853 * Fallback using the canonical names
1854 */
1855 alias = xmlParseCharEncoding(norig);
1856 if (alias != XML_CHAR_ENCODING_ERROR) {
1857 const char* canon;
1858 canon = xmlGetCharEncodingName(alias);
1859 if ((canon != NULL) && (strcmp(name, canon))) {
1860 return(xmlFindCharEncodingHandler(canon));
1861 }
1862 }
1863
1864 return(NULL);
1865}
1866
Daniel Veillard97ac1312001-05-30 19:14:17 +00001867/************************************************************************
1868 * *
1869 * ICONV based generic conversion functions *
1870 * *
1871 ************************************************************************/
1872
Owen Taylor3473f882001-02-23 17:55:21 +00001873#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert
 * @inlen:  the length of @in
 *
 * Run one iconv() conversion and translate its outcome into the return
 * codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (the input is not valid for the source
 *        encoding or cannot be represented in the target encoding), or
 *     -3 if the last bytes can't form a single output char (also used for
 *        any other failure).
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    const char *src = (const char *) in;
    char *dst = (char *) out;
    size_t srcLeft = *inlen;
    size_t dstLeft = *outlen;
    int ret;

    ret = iconv(cd, (char **) &src, &srcLeft, &dst, &dstLeft);

    /* iconv() counts down what remains, callers want what was used */
    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
    } else {
        *inlen -= srcLeft;
        *outlen -= dstLeft;
    }

    if ((srcLeft == 0) && (ret != -1))
        return 0;

#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL (truncated input) and anything unexpected */
    return -3;
}
1932#endif /* LIBXML_ICONV_ENABLED */
1933
Daniel Veillard97ac1312001-05-30 19:14:17 +00001934/************************************************************************
1935 * *
1936 * The real API used by libxml for on-the-fly conversion *
1937 * *
1938 ************************************************************************/
1939
Owen Taylor3473f882001-02-23 17:55:21 +00001940/**
1941 * xmlCharEncFirstLine:
1942 * @handler: char enconding transformation data structure
1943 * @out: an xmlBuffer for the output.
1944 * @in: an xmlBuffer for the input
1945 *
1946 * Front-end for the encoding handler input function, but handle only
1947 * the very first line, i.e. limit itself to 45 chars.
1948 *
1949 * Returns the number of byte written if success, or
1950 * -1 general error
1951 * -2 if the transcoding fails (for *in is not valid utf8 string or
1952 * the result of transformation can't fit into the encoding we want), or
1953 */
1954int
1955xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1956 xmlBufferPtr in) {
1957 int ret = -2;
1958 int written;
1959 int toconv;
1960
1961 if (handler == NULL) return(-1);
1962 if (out == NULL) return(-1);
1963 if (in == NULL) return(-1);
1964
1965 written = out->size - out->use;
1966 toconv = in->use;
1967 if (toconv * 2 >= written) {
1968 xmlBufferGrow(out, toconv);
1969 written = out->size - out->use - 1;
1970 }
1971
1972 /*
1973 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1974 * 45 chars should be sufficient to reach the end of the encoding
1975 * decalration without going too far inside the document content.
1976 */
1977 written = 45;
1978
1979 if (handler->input != NULL) {
1980 ret = handler->input(&out->content[out->use], &written,
1981 in->content, &toconv);
1982 xmlBufferShrink(in, toconv);
1983 out->use += written;
1984 out->content[out->use] = 0;
1985 }
1986#ifdef LIBXML_ICONV_ENABLED
1987 else if (handler->iconv_in != NULL) {
1988 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1989 &written, in->content, &toconv);
1990 xmlBufferShrink(in, toconv);
1991 out->use += written;
1992 out->content[out->use] = 0;
1993 if (ret == -1) ret = -3;
1994 }
1995#endif /* LIBXML_ICONV_ENABLED */
1996#ifdef DEBUG_ENCODING
1997 switch (ret) {
1998 case 0:
1999 xmlGenericError(xmlGenericErrorContext,
2000 "converted %d bytes to %d bytes of input\n",
2001 toconv, written);
2002 break;
2003 case -1:
2004 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2005 toconv, written, in->use);
2006 break;
2007 case -2:
2008 xmlGenericError(xmlGenericErrorContext,
2009 "input conversion failed due to input error\n");
2010 break;
2011 case -3:
2012 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2013 toconv, written, in->use);
2014 break;
2015 default:
2016 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2017 }
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002018#endif /* DEBUG_ENCODING */
Owen Taylor3473f882001-02-23 17:55:21 +00002019 /*
2020 * Ignore when input buffer is not on a boundary
2021 */
2022 if (ret == -3) ret = 0;
2023 if (ret == -1) ret = 0;
2024 return(ret);
2025}
2026
2027/**
2028 * xmlCharEncInFunc:
2029 * @handler: char enconding transformation data structure
2030 * @out: an xmlBuffer for the output.
2031 * @in: an xmlBuffer for the input
2032 *
2033 * Generic front-end for the encoding handler input function
2034 *
2035 * Returns the number of byte written if success, or
2036 * -1 general error
2037 * -2 if the transcoding fails (for *in is not valid utf8 string or
2038 * the result of transformation can't fit into the encoding we want), or
2039 */
2040int
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002041xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2042 xmlBufferPtr in)
2043{
Owen Taylor3473f882001-02-23 17:55:21 +00002044 int ret = -2;
2045 int written;
2046 int toconv;
2047
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002048 if (handler == NULL)
2049 return (-1);
2050 if (out == NULL)
2051 return (-1);
2052 if (in == NULL)
2053 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00002054
2055 toconv = in->use;
2056 if (toconv == 0)
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002057 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00002058 written = out->size - out->use;
2059 if (toconv * 2 >= written) {
2060 xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002061 written = out->size - out->use - 1;
Owen Taylor3473f882001-02-23 17:55:21 +00002062 }
2063 if (handler->input != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002064 ret = handler->input(&out->content[out->use], &written,
2065 in->content, &toconv);
2066 xmlBufferShrink(in, toconv);
2067 out->use += written;
2068 out->content[out->use] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002069 }
2070#ifdef LIBXML_ICONV_ENABLED
2071 else if (handler->iconv_in != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002072 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2073 &written, in->content, &toconv);
2074 xmlBufferShrink(in, toconv);
2075 out->use += written;
2076 out->content[out->use] = 0;
2077 if (ret == -1)
2078 ret = -3;
Owen Taylor3473f882001-02-23 17:55:21 +00002079 }
2080#endif /* LIBXML_ICONV_ENABLED */
2081 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002082 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002083#ifdef DEBUG_ENCODING
2084 xmlGenericError(xmlGenericErrorContext,
2085 "converted %d bytes to %d bytes of input\n",
2086 toconv, written);
Owen Taylor3473f882001-02-23 17:55:21 +00002087#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002088 break;
2089 case -1:
2090#ifdef DEBUG_ENCODING
2091 xmlGenericError(xmlGenericErrorContext,
2092 "converted %d bytes to %d bytes of input, %d left\n",
2093 toconv, written, in->use);
2094#endif
2095 break;
2096 case -3:
2097#ifdef DEBUG_ENCODING
2098 xmlGenericError(xmlGenericErrorContext,
2099 "converted %d bytes to %d bytes of input, %d left\n",
2100 toconv, written, in->use);
2101#endif
2102 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002103 case -2:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002104 xmlGenericError(xmlGenericErrorContext,
2105 "input conversion failed due to input error\n");
2106 xmlGenericError(xmlGenericErrorContext,
2107 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2108 in->content[0], in->content[1],
2109 in->content[2], in->content[3]);
Owen Taylor3473f882001-02-23 17:55:21 +00002110 }
2111 /*
2112 * Ignore when input buffer is not on a boundary
2113 */
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002114 if (ret == -3)
2115 ret = 0;
2116 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00002117}
2118
2119/**
2120 * xmlCharEncOutFunc:
2121 * @handler: char enconding transformation data structure
2122 * @out: an xmlBuffer for the output.
2123 * @in: an xmlBuffer for the input
2124 *
2125 * Generic front-end for the encoding handler output function
2126 * a first call with @in == NULL has to be made firs to initiate the
2127 * output in case of non-stateless encoding needing to initiate their
2128 * state or the output (like the BOM in UTF16).
2129 * In case of UTF8 sequence conversion errors for the given encoder,
2130 * the content will be automatically remapped to a CharRef sequence.
2131 *
2132 * Returns the number of byte written if success, or
2133 * -1 general error
2134 * -2 if the transcoding fails (for *in is not valid utf8 string or
2135 * the result of transformation can't fit into the encoding we want), or
2136 */
2137int
2138xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2139 xmlBufferPtr in) {
2140 int ret = -2;
2141 int written;
2142 int writtentot = 0;
2143 int toconv;
2144 int output = 0;
2145
2146 if (handler == NULL) return(-1);
2147 if (out == NULL) return(-1);
2148
2149retry:
2150
2151 written = out->size - out->use;
2152
2153 /*
2154 * First specific handling of in = NULL, i.e. the initialization call
2155 */
2156 if (in == NULL) {
2157 toconv = 0;
2158 if (handler->output != NULL) {
2159 ret = handler->output(&out->content[out->use], &written,
2160 NULL, &toconv);
2161 out->use += written;
2162 out->content[out->use] = 0;
2163 }
2164#ifdef LIBXML_ICONV_ENABLED
2165 else if (handler->iconv_out != NULL) {
2166 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2167 &written, NULL, &toconv);
2168 out->use += written;
2169 out->content[out->use] = 0;
2170 }
2171#endif /* LIBXML_ICONV_ENABLED */
2172#ifdef DEBUG_ENCODING
2173 xmlGenericError(xmlGenericErrorContext,
2174 "initialized encoder\n");
2175#endif
2176 return(0);
2177 }
2178
2179 /*
2180 * Convertion itself.
2181 */
2182 toconv = in->use;
2183 if (toconv == 0)
2184 return(0);
2185 if (toconv * 2 >= written) {
2186 xmlBufferGrow(out, toconv * 2);
2187 written = out->size - out->use - 1;
2188 }
2189 if (handler->output != NULL) {
2190 ret = handler->output(&out->content[out->use], &written,
2191 in->content, &toconv);
2192 xmlBufferShrink(in, toconv);
2193 out->use += written;
2194 writtentot += written;
2195 out->content[out->use] = 0;
2196 }
2197#ifdef LIBXML_ICONV_ENABLED
2198 else if (handler->iconv_out != NULL) {
2199 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2200 &written, in->content, &toconv);
2201 xmlBufferShrink(in, toconv);
2202 out->use += written;
2203 writtentot += written;
2204 out->content[out->use] = 0;
2205 if (ret == -1) {
2206 if (written > 0) {
2207 /*
2208 * Can be a limitation of iconv
2209 */
2210 goto retry;
2211 }
2212 ret = -3;
2213 }
2214 }
2215#endif /* LIBXML_ICONV_ENABLED */
2216 else {
2217 xmlGenericError(xmlGenericErrorContext,
2218 "xmlCharEncOutFunc: no output function !\n");
2219 return(-1);
2220 }
2221
2222 if (ret >= 0) output += ret;
2223
2224 /*
2225 * Attempt to handle error cases
2226 */
2227 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002228 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002229#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002230 xmlGenericError(xmlGenericErrorContext,
2231 "converted %d bytes to %d bytes of output\n",
2232 toconv, written);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002233#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002234 break;
2235 case -1:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002236#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002237 xmlGenericError(xmlGenericErrorContext,
2238 "output conversion failed by lack of space\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002239#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002240 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002241 case -3:
2242 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2243 toconv, written, in->use);
2244 break;
2245 case -2: {
2246 int len = in->use;
2247 const xmlChar *utf = (const xmlChar *) in->content;
2248 int cur;
2249
2250 cur = xmlGetUTF8Char(utf, &len);
2251 if (cur > 0) {
2252 xmlChar charref[20];
2253
2254#ifdef DEBUG_ENCODING
2255 xmlGenericError(xmlGenericErrorContext,
2256 "handling output conversion error\n");
2257 xmlGenericError(xmlGenericErrorContext,
2258 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2259 in->content[0], in->content[1],
2260 in->content[2], in->content[3]);
2261#endif
2262 /*
2263 * Removes the UTF8 sequence, and replace it by a charref
2264 * and continue the transcoding phase, hoping the error
2265 * did not mangle the encoder state.
2266 */
Daniel Veillard16698282001-09-14 10:29:27 +00002267 sprintf((char *) charref, "&#%d;", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002268 xmlBufferShrink(in, len);
2269 xmlBufferAddHead(in, charref, -1);
2270
2271 goto retry;
2272 } else {
2273 xmlGenericError(xmlGenericErrorContext,
2274 "output conversion failed due to conv error\n");
2275 xmlGenericError(xmlGenericErrorContext,
2276 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2277 in->content[0], in->content[1],
2278 in->content[2], in->content[3]);
2279 in->content[0] = ' ';
2280 }
2281 break;
2282 }
2283 }
2284 return(ret);
2285}
2286
2287/**
2288 * xmlCharEncCloseFunc:
2289 * @handler: char enconding transformation data structure
2290 *
2291 * Generic front-end for hencoding handler close function
2292 *
2293 * Returns 0 if success, or -1 in case of error
2294 */
2295int
2296xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2297 int ret = 0;
2298 if (handler == NULL) return(-1);
2299 if (handler->name == NULL) return(-1);
2300#ifdef LIBXML_ICONV_ENABLED
2301 /*
2302 * Iconv handlers can be oused only once, free the whole block.
2303 * and the associated icon resources.
2304 */
2305 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2306 if (handler->name != NULL)
2307 xmlFree(handler->name);
2308 handler->name = NULL;
2309 if (handler->iconv_out != NULL) {
2310 if (iconv_close(handler->iconv_out))
2311 ret = -1;
2312 handler->iconv_out = NULL;
2313 }
2314 if (handler->iconv_in != NULL) {
2315 if (iconv_close(handler->iconv_in))
2316 ret = -1;
2317 handler->iconv_in = NULL;
2318 }
2319 xmlFree(handler);
2320 }
2321#endif /* LIBXML_ICONV_ENABLED */
2322#ifdef DEBUG_ENCODING
2323 if (ret)
2324 xmlGenericError(xmlGenericErrorContext,
2325 "failed to close the encoding handler\n");
2326 else
2327 xmlGenericError(xmlGenericErrorContext,
2328 "closed the encoding handler\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002329#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002330
Owen Taylor3473f882001-02-23 17:55:21 +00002331 return(ret);
2332}
2333