blob: 06ebd2a353d4084768a8bcadd8d7bb1bba74fcaa [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 character codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 *
 * UTF8 string routines from:
 * "William M. Brack" <wbrack@mmm.com.hk>
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 */
25
Bjorn Reese70a9da52001-04-21 16:57:29 +000026#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000027
Owen Taylor3473f882001-02-23 17:55:21 +000028#include <string.h>
29
30#ifdef HAVE_CTYPE_H
31#include <ctype.h>
32#endif
33#ifdef HAVE_STDLIB_H
34#include <stdlib.h>
35#endif
Owen Taylor3473f882001-02-23 17:55:21 +000036#ifdef LIBXML_ICONV_ENABLED
37#ifdef HAVE_ERRNO_H
38#include <errno.h>
39#endif
40#endif
41#include <libxml/encoding.h>
42#include <libxml/xmlmemory.h>
43#ifdef LIBXML_HTML_ENABLED
44#include <libxml/HTMLparser.h>
45#endif
Daniel Veillard64a411c2001-10-15 12:32:07 +000046#include <libxml/globals.h>
Daniel Veillarda4617b82001-11-04 20:19:12 +000047#include <libxml/xmlerror.h>
Owen Taylor3473f882001-02-23 17:55:21 +000048
Daniel Veillard22090732001-07-16 00:06:07 +000049static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
50static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +000051
52typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
53typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
54struct _xmlCharEncodingAlias {
55 const char *name;
56 const char *alias;
57};
58
59static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
60static int xmlCharEncodingAliasesNb = 0;
61static int xmlCharEncodingAliasesMax = 0;
62
63#ifdef LIBXML_ICONV_ENABLED
64#if 0
65#define DEBUG_ENCODING /* Define this to get encoding traces */
66#endif
67#endif
68
69static int xmlLittleEndian = 1;
70
Daniel Veillard97ac1312001-05-30 19:14:17 +000071/************************************************************************
72 * *
73 * Generic UTF8 handling routines *
74 * *
75 * From rfc2044: encoding of the Unicode values on UTF-8: *
76 * *
77 * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
78 * 0000 0000-0000 007F 0xxxxxxx *
79 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
80 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
81 * *
82 * I hope we won't use values > 0xFFFF anytime soon ! *
83 * *
84 ************************************************************************/
Owen Taylor3473f882001-02-23 17:55:21 +000085
86/**
Daniel Veillarde043ee12001-04-16 14:08:07 +000087 * xmlUTF8Strlen:
88 * @utf: a sequence of UTF-8 encoded bytes
89 *
Daniel Veillard60087f32001-10-10 09:45:09 +000090 * compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillarde043ee12001-04-16 14:08:07 +000091 * checking of the content of the string.
92 *
93 * Returns the number of characters in the string or -1 in case of error
94 */
95int
Daniel Veillard97ac1312001-05-30 19:14:17 +000096xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillarde043ee12001-04-16 14:08:07 +000097 int ret = 0;
98
99 if (utf == NULL)
100 return(-1);
101
102 while (*utf != 0) {
103 if (utf[0] & 0x80) {
104 if ((utf[1] & 0xc0) != 0x80)
105 return(-1);
106 if ((utf[0] & 0xe0) == 0xe0) {
107 if ((utf[2] & 0xc0) != 0x80)
108 return(-1);
109 if ((utf[0] & 0xf0) == 0xf0) {
110 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
111 return(-1);
112 utf += 4;
113 } else {
114 utf += 3;
115 }
116 } else {
117 utf += 2;
118 }
119 } else {
120 utf++;
121 }
122 ret++;
123 }
124 return(ret);
125}
126
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  in: the number of bytes available at @utf;
 *        out: the number of bytes used by the decoded character
 *
 * Read one UTF-8 character from @utf.  Only the bit patterns of the
 * continuation bytes are checked; overlong forms are not rejected.
 *
 * Returns the character value, or -1 in case of error (NULL argument,
 * not enough bytes available, malformed sequence).  On error @len is set
 * to 0 when it is not NULL.
 */
static int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
193
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * A NULL @utf, or a continuation byte (10xxxxxx) found where a lead
 * byte is expected, is reported as invalid.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * The continuation bytes are tested in order, so a string ending in
     * the middle of a sequence fails on its terminating 0 before any
     * byte past the terminator is read.
     */
    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code: 1110xxxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code: 11110xxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte, or a lead byte beyond 4-byte form */
            return(0);
        }
    }
    return(1);
}
237
238/**
Daniel Veillard97ac1312001-05-30 19:14:17 +0000239 * xmlUTF8Strsize:
240 * @utf: a sequence of UTF-8 encoded bytes
241 * @len: the number of characters in the array
242 *
243 * storage size of an UTF8 string
244 *
245 * Returns the storage size of
246 * the first 'len' characters of ARRAY
247 *
248 */
249
250int
251xmlUTF8Strsize(const xmlChar *utf, int len) {
252 const xmlChar *ptr=utf;
253 xmlChar ch;
254
255 if (len <= 0)
256 return(0);
257
258 while ( len-- > 0) {
259 if ( !*ptr )
260 break;
261 if ( (ch = *ptr++) & 0x80)
262 while ( (ch<<=1) & 0x80 )
263 ptr++;
264 }
265 return (ptr - utf);
266}
267
268
269/**
270 * xmlUTF8Strndup:
271 * @utf: the input UTF8 *
272 * @len: the len of @utf (in chars)
273 *
274 * a strndup for array of UTF8's
275 *
276 * Returns a new UTF8 * or NULL
277 */
278xmlChar *
279xmlUTF8Strndup(const xmlChar *utf, int len) {
280 xmlChar *ret;
281 int i;
282
283 if ((utf == NULL) || (len < 0)) return(NULL);
284 i = xmlUTF8Strsize(utf, len);
285 ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
286 if (ret == NULL) {
287 xmlGenericError(xmlGenericErrorContext,
288 "malloc of %ld byte failed\n",
289 (len + 1) * (long)sizeof(xmlChar));
290 return(NULL);
291 }
292 memcpy(ret, utf, i * sizeof(xmlChar));
293 ret[i] = 0;
294 return(ret);
295}
296
297/**
298 * xmlUTF8Strpos:
299 * @utf: the input UTF8 *
300 * @pos: the position of the desired UTF8 char (in chars)
301 *
302 * a function to provide the equivalent of fetching a
303 * character from a string array
304 *
305 * Returns a pointer to the UTF8 character or NULL
306 */
307xmlChar *
308xmlUTF8Strpos(const xmlChar *utf, int pos) {
309 xmlChar ch;
310
311 if (utf == NULL) return(NULL);
312 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
313 return(NULL);
314 while (pos--) {
315 if ((ch=*utf++) == 0) return(NULL);
316 if ( ch & 0x80 ) {
317 /* if not simple ascii, verify proper format */
318 if ( (ch & 0xc0) != 0xc0 )
319 return(NULL);
320 /* then skip over remaining bytes for this char */
321 while ( (ch <<= 1) & 0x80 )
322 if ( (*utf++ & 0xc0) != 0x80 )
323 return(NULL);
324 }
325 }
326 return((xmlChar *)utf);
327}
328
329/**
330 * xmlUTF8Strloc:
331 * @utf: the input UTF8 *
332 * @utfchar: the UTF8 character to be found
333 *
334 * a function to provide relative location of a UTF8 char
335 *
336 * Returns the relative character position of the desired char
337 * or -1 if not found
338 */
339int
340xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
341 int i, size;
342 xmlChar ch;
343
344 if (utf==NULL || utfchar==NULL) return -1;
345 size = xmlUTF8Strsize(utfchar, 1);
346 for(i=0; (ch=*utf) != 0; i++) {
347 if (xmlStrncmp(utf, utfchar, size)==0)
348 return(i);
349 utf++;
350 if ( ch & 0x80 ) {
351 /* if not simple ascii, verify proper format */
352 if ( (ch & 0xc0) != 0xc0 )
353 return(-1);
354 /* then skip over remaining bytes for this char */
355 while ( (ch <<= 1) & 0x80 )
356 if ( (*utf++ & 0xc0) != 0x80 )
357 return(-1);
358 }
359 }
360
361 return(-1);
362}
363/**
364 * xmlUTF8Strsub:
365 * @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard97ac1312001-05-30 19:14:17 +0000366 * @start: relative pos of first char
367 * @len: total number to copy
368 *
369 * Note: positions are given in units of UTF-8 chars
370 *
371 * Returns a pointer to a newly created string
372 * or NULL if any problem
373 */
374
375xmlChar *
376xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
377 int i;
378 xmlChar ch;
379
380 if (utf == NULL) return(NULL);
381 if (start < 0) return(NULL);
382 if (len < 0) return(NULL);
383
384 /*
385 * Skip over any leading chars
386 */
387 for (i = 0;i < start;i++) {
388 if ((ch=*utf++) == 0) return(NULL);
389 if ( ch & 0x80 ) {
390 /* if not simple ascii, verify proper format */
391 if ( (ch & 0xc0) != 0xc0 )
392 return(NULL);
393 /* then skip over remaining bytes for this char */
394 while ( (ch <<= 1) & 0x80 )
395 if ( (*utf++ & 0xc0) != 0x80 )
396 return(NULL);
397 }
398 }
399
400 return(xmlUTF8Strndup(utf, len));
401}
402
403/************************************************************************
404 * *
405 * Conversions To/From UTF8 encoding *
406 * *
407 ************************************************************************/
408
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out.  As ASCII is a subset of UTF-8 every input byte
 * yields exactly one output byte; the conversion stops when either
 * buffer is exhausted.
 *
 * Returns 0 if success, or -1 if a byte outside of the ASCII range
 * is found.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + (*outlen);
    const unsigned char* instart = in;
    const unsigned char* inend = in + (*inlen);

    while ((in < inend) && (out < outend)) {
        if (*in >= 0x80) {
            /* not ASCII: report what was converted before this byte */
            *outlen = out - outstart;
            *inlen = in - instart;
            return(-1);
        }
        *out++ = *in++;
    }
    *outlen = out - outstart;
    *inlen = in - instart;
    return(0);
}
459
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.  A NULL @in is an initialization call for which
 * there is nothing to do.  The conversion stops early, without error,
 * when @out is full or when @in ends in the middle of a sequence.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8
 * or a character which cannot be represented in ASCII)
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto transcoding_error; /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto transcoding_error; /* no chance for this in Ascii */

        /* incomplete sequence at the end of the block: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a malformed sequence must not be turned into a character */
            if ((d & 0xC0) != 0x80)
                goto transcoding_error;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80)
            goto transcoding_error;     /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

transcoding_error:
    /* report only the characters which were fully converted */
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
542
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.  Bytes below 0x80 are copied, the others become a
 * 2-byte sequence.  The conversion stops when @out cannot hold the next
 * character completely, so a truncated sequence is never produced.
 *
 * Returns 0
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + (*outlen);
    const unsigned char* instart = in;
    const unsigned char* inend = in + (*inlen);
    unsigned int c;

    while ((in < inend) && (out < outend)) {
        c = *in;
        if (c < 0x80) {
            *out++ = (unsigned char) c;
        } else {
            /* both bytes must fit before anything is written */
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            *out++ = (unsigned char) ((c & 0x3F) | 0x80);
        }
        in++;
    }
    *outlen = out - outstart;
    *inlen = in - instart;
    return(0);
}
590
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.  A NULL @in is an initialization call for which
 * there is nothing to do.  The conversion stops early, without error,
 * when @out is full or when @in ends in the middle of a sequence.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8
 * or a character above 0xFF)
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* instart = in;
    const unsigned char* consumed = in;
    const unsigned char* inend;
    const unsigned char* outstart = out;
    const unsigned char* outend;
    unsigned int value, byte;
    int extra;

    if (in == NULL) {
        /* initialization call: nothing to emit */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    inend = in + (*inlen);
    outend = out + (*outlen);

    for (;;) {
        if (in >= inend)
            break;

        /* the lead byte gives the payload bits and the sequence length */
        byte = *in++;
        if (byte < 0x80) {
            value = byte;
            extra = 0;
        } else if (byte < 0xC0) {
            /* trailing byte in leading position */
            goto failed;
        } else if (byte < 0xE0) {
            value = byte & 0x1F;
            extra = 1;
        } else if (byte < 0xF0) {
            value = byte & 0x0F;
            extra = 2;
        } else if (byte < 0xF8) {
            value = byte & 0x07;
            extra = 3;
        } else {
            /* no chance for this in IsoLat1 */
            goto failed;
        }

        /* the sequence is cut by the end of the block */
        if (inend - in < extra)
            break;

        while (extra > 0) {
            byte = *in++;
            if ((byte & 0xC0) != 0x80)
                goto failed;
            value = (value << 6) | (byte & 0x3F);
            extra--;
        }

        /* no chance for this in IsoLat1 */
        if (value > 0xFF)
            goto failed;
        if (out >= outend)
            break;
        *out++ = value;
        consumed = in;
    }

    *outlen = out - outstart;
    *inlen = consumed - instart;
    return(0);

failed:
    *outlen = out - outstart;
    *inlen = consumed - instart;
    return(-2);
}
678
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes; a trailing odd byte is ignored
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is read byte by byte, so the
 * result depends neither on the byte order of the host nor on the
 * alignment of @inb.  The conversion stops early, without error, when
 * @out cannot hold the next character completely or when @inb ends
 * between the two halves of a surrogate pair.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate is not followed by a low surrogate)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits, needed;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;

    while (in < inend) {
        /* little endian: low order byte first */
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)
                break;                   /* second half not available yet */
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) needed = 1;
        else if (c <   0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else                  needed = 4;
        /* never emit a truncated sequence */
        if (outend - out < needed)
            break;

        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
766
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.  A NULL @in is an initialization call: the Byte
 * Order Mark FF FE is written if @outb has room for it.  The output is
 * written byte by byte, so the result depends neither on the byte order
 * of the host nor on the alignment of @outb.  The conversion stops early,
 * without error, when @outb is full, when @in ends in the middle of a
 * sequence or on a value above 0x10FFFF.
 *
 * Returns 0 if success (2 when the Byte Order Mark was written), or -2
 * if the transcoding failed (malformed UTF-8).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, unit;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto transcoding_error; /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto transcoding_error; /* no chance for this in UTF-16 */

        /* incomplete sequence at the end of the block: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a malformed sequence must not be turned into a character */
            if ((d & 0xC0) != 0x80)
                goto transcoding_error;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        }
        else if (c < 0x110000) {
            /* surrogate pair: both units must fit */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            unit = 0xD800 | (c >> 10);
            *out++ = (unsigned char) (unit & 0xFF);
            *out++ = (unsigned char) (unit >> 8);
            unit = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (unit & 0xFF);
            *out++ = (unsigned char) (unit >> 8);
        }
        else
            break;
        processed = in;
    }
    *outlen = out - outb;
    /* consumed input is measured from the start of the block */
    *inlen = processed - instart;
    return(0);

transcoding_error:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
885
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes; a trailing odd byte is ignored
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is read byte by byte, so the
 * result depends neither on the byte order of the host nor on the
 * alignment of @inb.  The conversion stops early, without error, when
 * @out cannot hold the next character completely.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate is not followed by a low surrogate, or is the last code
 * unit of the block)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits, needed;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;

    while (in < inend) {
        /* big endian: high order byte first */
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) needed = 1;
        else if (c <   0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else                  needed = 4;
        /*
         * never emit a truncated sequence: the character stays
         * unconsumed until the caller provides enough room
         */
        if (outend - out < needed)
            break;

        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
977
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out.  A NULL @in is an initialization call: the Byte
 * Order Mark FE FF is written if @outb has room for it.  The output is
 * written byte by byte, so the result depends neither on the byte order
 * of the host nor on the alignment of @outb.  The conversion stops early,
 * without error, when @outb is full, when @in ends in the middle of a
 * sequence or on a value above 0x10FFFF.
 *
 * Returns 0 if success (2 when the Byte Order Mark was written), or -2
 * if the transcoding failed (malformed UTF-8).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, unit;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto transcoding_error; /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto transcoding_error; /* no chance for this in UTF-16 */

        /* incomplete sequence at the end of the block: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a malformed sequence must not be turned into a character */
            if ((d & 0xC0) != 0x80)
                goto transcoding_error;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) (c & 0xFF);
        }
        else if (c < 0x110000) {
            /* surrogate pair: both units must fit */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            unit = 0xD800 | (c >> 10);
            *out++ = (unsigned char) (unit >> 8);
            *out++ = (unsigned char) (unit & 0xFF);
            unit = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (unit >> 8);
            *out++ = (unsigned char) (unit & 0xFF);
        }
        else
            break;
        processed = in;
    }
    *outlen = out - outb;
    /* consumed input is measured from the start of the block */
    *inlen = processed - instart;
    return(0);

transcoding_error:
    /* both lengths are reported in bytes, as on the success path */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
1093
Daniel Veillard97ac1312001-05-30 19:14:17 +00001094/************************************************************************
1095 * *
1096 * Generic encoding handling routines *
1097 * *
1098 ************************************************************************/
1099
Owen Taylor3473f882001-02-23 17:55:21 +00001100/**
1101 * xmlDetectCharEncoding:
1102 * @in: a pointer to the first bytes of the XML entity, must be at least
1103 * 4 bytes long.
1104 * @len: pointer to the length of the buffer
1105 *
1106 * Guess the encoding of the entity using the first bytes of the entity content
1107 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1108 *
1109 * Returns one of the XML_CHAR_ENCODING_... values.
1110 */
1111xmlCharEncoding
1112xmlDetectCharEncoding(const unsigned char* in, int len)
1113{
1114 if (len >= 4) {
1115 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1116 (in[2] == 0x00) && (in[3] == 0x3C))
1117 return(XML_CHAR_ENCODING_UCS4BE);
1118 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1119 (in[2] == 0x00) && (in[3] == 0x00))
1120 return(XML_CHAR_ENCODING_UCS4LE);
1121 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1122 (in[2] == 0x3C) && (in[3] == 0x00))
1123 return(XML_CHAR_ENCODING_UCS4_2143);
1124 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1125 (in[2] == 0x00) && (in[3] == 0x00))
1126 return(XML_CHAR_ENCODING_UCS4_3412);
1127 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1128 (in[2] == 0xA7) && (in[3] == 0x94))
1129 return(XML_CHAR_ENCODING_EBCDIC);
1130 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1131 (in[2] == 0x78) && (in[3] == 0x6D))
1132 return(XML_CHAR_ENCODING_UTF8);
1133 }
Daniel Veillard87a764e2001-06-20 17:41:10 +00001134 if (len >= 3) {
1135 /*
1136 * Errata on XML-1.0 June 20 2001
1137 * We now allow an UTF8 encoded BOM
1138 */
1139 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1140 (in[2] == 0xBF))
1141 return(XML_CHAR_ENCODING_UTF8);
1142 }
Owen Taylor3473f882001-02-23 17:55:21 +00001143 if (len >= 2) {
1144 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1145 return(XML_CHAR_ENCODING_UTF16BE);
1146 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1147 return(XML_CHAR_ENCODING_UTF16LE);
1148 }
1149 return(XML_CHAR_ENCODING_NONE);
1150}
1151
1152/**
1153 * xmlCleanupEncodingAliases:
1154 *
1155 * Unregisters all aliases
1156 */
1157void
1158xmlCleanupEncodingAliases(void) {
1159 int i;
1160
1161 if (xmlCharEncodingAliases == NULL)
1162 return;
1163
1164 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1165 if (xmlCharEncodingAliases[i].name != NULL)
1166 xmlFree((char *) xmlCharEncodingAliases[i].name);
1167 if (xmlCharEncodingAliases[i].alias != NULL)
1168 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1169 }
1170 xmlCharEncodingAliasesNb = 0;
1171 xmlCharEncodingAliasesMax = 0;
1172 xmlFree(xmlCharEncodingAliases);
Daniel Veillard73c6e532002-01-08 13:15:33 +00001173 xmlCharEncodingAliases = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001174}
1175
1176/**
1177 * xmlGetEncodingAlias:
1178 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1179 *
1180 * Lookup an encoding name for the given alias.
1181 *
1182 * Returns NULL if not found the original name otherwise
1183 */
1184const char *
1185xmlGetEncodingAlias(const char *alias) {
1186 int i;
1187 char upper[100];
1188
1189 if (alias == NULL)
1190 return(NULL);
1191
1192 if (xmlCharEncodingAliases == NULL)
1193 return(NULL);
1194
1195 for (i = 0;i < 99;i++) {
1196 upper[i] = toupper(alias[i]);
1197 if (upper[i] == 0) break;
1198 }
1199 upper[i] = 0;
1200
1201 /*
1202 * Walk down the list looking for a definition of the alias
1203 */
1204 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1205 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1206 return(xmlCharEncodingAliases[i].name);
1207 }
1208 }
1209 return(NULL);
1210}
1211
1212/**
1213 * xmlAddEncodingAlias:
1214 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1215 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1216 *
1217 * Registers and alias @alias for an encoding named @name. Existing alias
1218 * will be overwritten.
1219 *
1220 * Returns 0 in case of success, -1 in case of error
1221 */
1222int
1223xmlAddEncodingAlias(const char *name, const char *alias) {
1224 int i;
1225 char upper[100];
1226
1227 if ((name == NULL) || (alias == NULL))
1228 return(-1);
1229
1230 for (i = 0;i < 99;i++) {
1231 upper[i] = toupper(alias[i]);
1232 if (upper[i] == 0) break;
1233 }
1234 upper[i] = 0;
1235
1236 if (xmlCharEncodingAliases == NULL) {
1237 xmlCharEncodingAliasesNb = 0;
1238 xmlCharEncodingAliasesMax = 20;
1239 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1240 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1241 if (xmlCharEncodingAliases == NULL)
1242 return(-1);
1243 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1244 xmlCharEncodingAliasesMax *= 2;
1245 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1246 xmlRealloc(xmlCharEncodingAliases,
1247 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1248 }
1249 /*
1250 * Walk down the list looking for a definition of the alias
1251 */
1252 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1253 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1254 /*
1255 * Replace the definition.
1256 */
1257 xmlFree((char *) xmlCharEncodingAliases[i].name);
1258 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1259 return(0);
1260 }
1261 }
1262 /*
1263 * Add the definition
1264 */
1265 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1266 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1267 xmlCharEncodingAliasesNb++;
1268 return(0);
1269}
1270
1271/**
1272 * xmlDelEncodingAlias:
1273 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1274 *
1275 * Unregisters an encoding alias @alias
1276 *
1277 * Returns 0 in case of success, -1 in case of error
1278 */
1279int
1280xmlDelEncodingAlias(const char *alias) {
1281 int i;
1282
1283 if (alias == NULL)
1284 return(-1);
1285
1286 if (xmlCharEncodingAliases == NULL)
1287 return(-1);
1288 /*
1289 * Walk down the list looking for a definition of the alias
1290 */
1291 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1292 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1293 xmlFree((char *) xmlCharEncodingAliases[i].name);
1294 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1295 xmlCharEncodingAliasesNb--;
1296 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1297 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1298 return(0);
1299 }
1300 }
1301 return(-1);
1302}
1303
1304/**
1305 * xmlParseCharEncoding:
1306 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1307 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001308 * Compare the string to the known encoding schemes already known. Note
Owen Taylor3473f882001-02-23 17:55:21 +00001309 * that the comparison is case insensitive accordingly to the section
1310 * [XML] 4.3.3 Character Encoding in Entities.
1311 *
1312 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1313 * if not recognized.
1314 */
1315xmlCharEncoding
1316xmlParseCharEncoding(const char* name)
1317{
1318 const char *alias;
1319 char upper[500];
1320 int i;
1321
1322 if (name == NULL)
1323 return(XML_CHAR_ENCODING_NONE);
1324
1325 /*
1326 * Do the alias resolution
1327 */
1328 alias = xmlGetEncodingAlias(name);
1329 if (alias != NULL)
1330 name = alias;
1331
1332 for (i = 0;i < 499;i++) {
1333 upper[i] = toupper(name[i]);
1334 if (upper[i] == 0) break;
1335 }
1336 upper[i] = 0;
1337
1338 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1339 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1340 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1341
1342 /*
1343 * NOTE: if we were able to parse this, the endianness of UTF16 is
1344 * already found and in use
1345 */
1346 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1347 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1348
1349 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1350 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1351 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1352
1353 /*
1354 * NOTE: if we were able to parse this, the endianness of UCS4 is
1355 * already found and in use
1356 */
1357 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1358 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1359 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1360
1361
1362 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1363 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1364 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1365
1366 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1367 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1368 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1369
1370 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1371 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1372 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1373 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1374 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1375 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1376 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1377
1378 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1379 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1380 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1381
1382#ifdef DEBUG_ENCODING
1383 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1384#endif
1385 return(XML_CHAR_ENCODING_ERROR);
1386}
1387
1388/**
1389 * xmlGetCharEncodingName:
1390 * @enc: the encoding
1391 *
1392 * The "canonical" name for XML encoding.
1393 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1394 * Section 4.3.3 Character Encoding in Entities
1395 *
1396 * Returns the canonical name for the given encoding
1397 */
1398
1399const char*
1400xmlGetCharEncodingName(xmlCharEncoding enc) {
1401 switch (enc) {
1402 case XML_CHAR_ENCODING_ERROR:
1403 return(NULL);
1404 case XML_CHAR_ENCODING_NONE:
1405 return(NULL);
1406 case XML_CHAR_ENCODING_UTF8:
1407 return("UTF-8");
1408 case XML_CHAR_ENCODING_UTF16LE:
1409 return("UTF-16");
1410 case XML_CHAR_ENCODING_UTF16BE:
1411 return("UTF-16");
1412 case XML_CHAR_ENCODING_EBCDIC:
1413 return("EBCDIC");
1414 case XML_CHAR_ENCODING_UCS4LE:
1415 return("ISO-10646-UCS-4");
1416 case XML_CHAR_ENCODING_UCS4BE:
1417 return("ISO-10646-UCS-4");
1418 case XML_CHAR_ENCODING_UCS4_2143:
1419 return("ISO-10646-UCS-4");
1420 case XML_CHAR_ENCODING_UCS4_3412:
1421 return("ISO-10646-UCS-4");
1422 case XML_CHAR_ENCODING_UCS2:
1423 return("ISO-10646-UCS-2");
1424 case XML_CHAR_ENCODING_8859_1:
1425 return("ISO-8859-1");
1426 case XML_CHAR_ENCODING_8859_2:
1427 return("ISO-8859-2");
1428 case XML_CHAR_ENCODING_8859_3:
1429 return("ISO-8859-3");
1430 case XML_CHAR_ENCODING_8859_4:
1431 return("ISO-8859-4");
1432 case XML_CHAR_ENCODING_8859_5:
1433 return("ISO-8859-5");
1434 case XML_CHAR_ENCODING_8859_6:
1435 return("ISO-8859-6");
1436 case XML_CHAR_ENCODING_8859_7:
1437 return("ISO-8859-7");
1438 case XML_CHAR_ENCODING_8859_8:
1439 return("ISO-8859-8");
1440 case XML_CHAR_ENCODING_8859_9:
1441 return("ISO-8859-9");
1442 case XML_CHAR_ENCODING_2022_JP:
1443 return("ISO-2022-JP");
1444 case XML_CHAR_ENCODING_SHIFT_JIS:
1445 return("Shift-JIS");
1446 case XML_CHAR_ENCODING_EUC_JP:
1447 return("EUC-JP");
1448 case XML_CHAR_ENCODING_ASCII:
1449 return(NULL);
1450 }
1451 return(NULL);
1452}
1453
Daniel Veillard97ac1312001-05-30 19:14:17 +00001454/************************************************************************
1455 * *
1456 * Char encoding handlers *
1457 * *
1458 ************************************************************************/
1459
Owen Taylor3473f882001-02-23 17:55:21 +00001460
1461/* the size should be growable, but it's not a big deal ... */
1462#define MAX_ENCODING_HANDLERS 50
1463static xmlCharEncodingHandlerPtr *handlers = NULL;
1464static int nbCharEncodingHandler = 0;
1465
1466/*
1467 * The default is UTF-8 for XML, that's also the default used for the
1468 * parser internals, so the default encoding handler is NULL
1469 */
1470
1471static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1472
1473/**
1474 * xmlNewCharEncodingHandler:
1475 * @name: the encoding name, in UTF-8 format (ASCII actually)
1476 * @input: the xmlCharEncodingInputFunc to read that encoding
1477 * @output: the xmlCharEncodingOutputFunc to write that encoding
1478 *
1479 * Create and registers an xmlCharEncodingHandler.
1480 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1481 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001482static xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001483xmlNewCharEncodingHandler(const char *name,
1484 xmlCharEncodingInputFunc input,
1485 xmlCharEncodingOutputFunc output) {
1486 xmlCharEncodingHandlerPtr handler;
1487 const char *alias;
1488 char upper[500];
1489 int i;
1490 char *up = 0;
1491
1492 /*
1493 * Do the alias resolution
1494 */
1495 alias = xmlGetEncodingAlias(name);
1496 if (alias != NULL)
1497 name = alias;
1498
1499 /*
1500 * Keep only the uppercase version of the encoding.
1501 */
1502 if (name == NULL) {
1503 xmlGenericError(xmlGenericErrorContext,
1504 "xmlNewCharEncodingHandler : no name !\n");
1505 return(NULL);
1506 }
1507 for (i = 0;i < 499;i++) {
1508 upper[i] = toupper(name[i]);
1509 if (upper[i] == 0) break;
1510 }
1511 upper[i] = 0;
1512 up = xmlMemStrdup(upper);
1513 if (up == NULL) {
1514 xmlGenericError(xmlGenericErrorContext,
1515 "xmlNewCharEncodingHandler : out of memory !\n");
1516 return(NULL);
1517 }
1518
1519 /*
1520 * allocate and fill-up an handler block.
1521 */
1522 handler = (xmlCharEncodingHandlerPtr)
1523 xmlMalloc(sizeof(xmlCharEncodingHandler));
1524 if (handler == NULL) {
1525 xmlGenericError(xmlGenericErrorContext,
1526 "xmlNewCharEncodingHandler : out of memory !\n");
1527 return(NULL);
1528 }
1529 handler->input = input;
1530 handler->output = output;
1531 handler->name = up;
1532
1533#ifdef LIBXML_ICONV_ENABLED
1534 handler->iconv_in = NULL;
1535 handler->iconv_out = NULL;
1536#endif /* LIBXML_ICONV_ENABLED */
1537
1538 /*
1539 * registers and returns the handler.
1540 */
1541 xmlRegisterCharEncodingHandler(handler);
1542#ifdef DEBUG_ENCODING
1543 xmlGenericError(xmlGenericErrorContext,
1544 "Registered encoding handler for %s\n", name);
1545#endif
1546 return(handler);
1547}
1548
1549/**
1550 * xmlInitCharEncodingHandlers:
1551 *
1552 * Initialize the char encoding support, it registers the default
1553 * encoding supported.
1554 * NOTE: while public, this function usually doesn't need to be called
1555 * in normal processing.
1556 */
1557void
1558xmlInitCharEncodingHandlers(void) {
1559 unsigned short int tst = 0x1234;
1560 unsigned char *ptr = (unsigned char *) &tst;
1561
1562 if (handlers != NULL) return;
1563
1564 handlers = (xmlCharEncodingHandlerPtr *)
1565 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1566
1567 if (*ptr == 0x12) xmlLittleEndian = 0;
1568 else if (*ptr == 0x34) xmlLittleEndian = 1;
1569 else xmlGenericError(xmlGenericErrorContext,
1570 "Odd problem at endianness detection\n");
1571
1572 if (handlers == NULL) {
1573 xmlGenericError(xmlGenericErrorContext,
1574 "xmlInitCharEncodingHandlers : out of memory !\n");
1575 return;
1576 }
1577 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1578 xmlUTF16LEHandler =
1579 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1580 xmlUTF16BEHandler =
1581 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1582 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1583 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard20042422001-05-31 18:22:04 +00001584 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor3473f882001-02-23 17:55:21 +00001585#ifdef LIBXML_HTML_ENABLED
1586 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1587#endif
1588}
1589
1590/**
1591 * xmlCleanupCharEncodingHandlers:
1592 *
1593 * Cleanup the memory allocated for the char encoding support, it
1594 * unregisters all the encoding handlers and the aliases.
1595 */
1596void
1597xmlCleanupCharEncodingHandlers(void) {
1598 xmlCleanupEncodingAliases();
1599
1600 if (handlers == NULL) return;
1601
1602 for (;nbCharEncodingHandler > 0;) {
1603 nbCharEncodingHandler--;
1604 if (handlers[nbCharEncodingHandler] != NULL) {
1605 if (handlers[nbCharEncodingHandler]->name != NULL)
1606 xmlFree(handlers[nbCharEncodingHandler]->name);
1607 xmlFree(handlers[nbCharEncodingHandler]);
1608 }
1609 }
1610 xmlFree(handlers);
1611 handlers = NULL;
1612 nbCharEncodingHandler = 0;
1613 xmlDefaultCharEncodingHandler = NULL;
1614}
1615
1616/**
1617 * xmlRegisterCharEncodingHandler:
1618 * @handler: the xmlCharEncodingHandlerPtr handler block
1619 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001620 * Register the char encoding handler, surprising, isn't it ?
Owen Taylor3473f882001-02-23 17:55:21 +00001621 */
1622void
1623xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1624 if (handlers == NULL) xmlInitCharEncodingHandlers();
1625 if (handler == NULL) {
1626 xmlGenericError(xmlGenericErrorContext,
1627 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1628 return;
1629 }
1630
1631 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1632 xmlGenericError(xmlGenericErrorContext,
1633 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1634 xmlGenericError(xmlGenericErrorContext,
1635 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1636 return;
1637 }
1638 handlers[nbCharEncodingHandler++] = handler;
1639}
1640
1641/**
1642 * xmlGetCharEncodingHandler:
1643 * @enc: an xmlCharEncoding value.
1644 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001645 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001646 *
1647 * Returns the handler or NULL if not found
1648 */
1649xmlCharEncodingHandlerPtr
1650xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1651 xmlCharEncodingHandlerPtr handler;
1652
1653 if (handlers == NULL) xmlInitCharEncodingHandlers();
1654 switch (enc) {
1655 case XML_CHAR_ENCODING_ERROR:
1656 return(NULL);
1657 case XML_CHAR_ENCODING_NONE:
1658 return(NULL);
1659 case XML_CHAR_ENCODING_UTF8:
1660 return(NULL);
1661 case XML_CHAR_ENCODING_UTF16LE:
1662 return(xmlUTF16LEHandler);
1663 case XML_CHAR_ENCODING_UTF16BE:
1664 return(xmlUTF16BEHandler);
1665 case XML_CHAR_ENCODING_EBCDIC:
1666 handler = xmlFindCharEncodingHandler("EBCDIC");
1667 if (handler != NULL) return(handler);
1668 handler = xmlFindCharEncodingHandler("ebcdic");
1669 if (handler != NULL) return(handler);
1670 break;
1671 case XML_CHAR_ENCODING_UCS4BE:
1672 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1673 if (handler != NULL) return(handler);
1674 handler = xmlFindCharEncodingHandler("UCS-4");
1675 if (handler != NULL) return(handler);
1676 handler = xmlFindCharEncodingHandler("UCS4");
1677 if (handler != NULL) return(handler);
1678 break;
1679 case XML_CHAR_ENCODING_UCS4LE:
1680 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1681 if (handler != NULL) return(handler);
1682 handler = xmlFindCharEncodingHandler("UCS-4");
1683 if (handler != NULL) return(handler);
1684 handler = xmlFindCharEncodingHandler("UCS4");
1685 if (handler != NULL) return(handler);
1686 break;
1687 case XML_CHAR_ENCODING_UCS4_2143:
1688 break;
1689 case XML_CHAR_ENCODING_UCS4_3412:
1690 break;
1691 case XML_CHAR_ENCODING_UCS2:
1692 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1693 if (handler != NULL) return(handler);
1694 handler = xmlFindCharEncodingHandler("UCS-2");
1695 if (handler != NULL) return(handler);
1696 handler = xmlFindCharEncodingHandler("UCS2");
1697 if (handler != NULL) return(handler);
1698 break;
1699
1700 /*
1701 * We used to keep ISO Latin encodings native in the
1702 * generated data. This led to so many problems that
1703 * this has been removed. One can still change this
1704 * back by registering no-ops encoders for those
1705 */
1706 case XML_CHAR_ENCODING_8859_1:
1707 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1708 if (handler != NULL) return(handler);
1709 break;
1710 case XML_CHAR_ENCODING_8859_2:
1711 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1712 if (handler != NULL) return(handler);
1713 break;
1714 case XML_CHAR_ENCODING_8859_3:
1715 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1716 if (handler != NULL) return(handler);
1717 break;
1718 case XML_CHAR_ENCODING_8859_4:
1719 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1720 if (handler != NULL) return(handler);
1721 break;
1722 case XML_CHAR_ENCODING_8859_5:
1723 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1724 if (handler != NULL) return(handler);
1725 break;
1726 case XML_CHAR_ENCODING_8859_6:
1727 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1728 if (handler != NULL) return(handler);
1729 break;
1730 case XML_CHAR_ENCODING_8859_7:
1731 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1732 if (handler != NULL) return(handler);
1733 break;
1734 case XML_CHAR_ENCODING_8859_8:
1735 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1736 if (handler != NULL) return(handler);
1737 break;
1738 case XML_CHAR_ENCODING_8859_9:
1739 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1740 if (handler != NULL) return(handler);
1741 break;
1742
1743
1744 case XML_CHAR_ENCODING_2022_JP:
1745 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1746 if (handler != NULL) return(handler);
1747 break;
1748 case XML_CHAR_ENCODING_SHIFT_JIS:
1749 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1750 if (handler != NULL) return(handler);
1751 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1752 if (handler != NULL) return(handler);
1753 handler = xmlFindCharEncodingHandler("Shift_JIS");
1754 if (handler != NULL) return(handler);
1755 break;
1756 case XML_CHAR_ENCODING_EUC_JP:
1757 handler = xmlFindCharEncodingHandler("EUC-JP");
1758 if (handler != NULL) return(handler);
1759 break;
1760 default:
1761 break;
1762 }
1763
1764#ifdef DEBUG_ENCODING
1765 xmlGenericError(xmlGenericErrorContext,
1766 "No handler found for encoding %d\n", enc);
1767#endif
1768 return(NULL);
1769}
1770
1771/**
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001772 * xmlFindCharEncodingHandler:
1773 * @name: a string describing the char encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001774 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001775 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001776 *
1777 * Returns the handler or NULL if not found
1778 */
1779xmlCharEncodingHandlerPtr
1780xmlFindCharEncodingHandler(const char *name) {
1781 const char *nalias;
1782 const char *norig;
1783 xmlCharEncoding alias;
1784#ifdef LIBXML_ICONV_ENABLED
1785 xmlCharEncodingHandlerPtr enc;
1786 iconv_t icv_in, icv_out;
1787#endif /* LIBXML_ICONV_ENABLED */
1788 char upper[100];
1789 int i;
1790
1791 if (handlers == NULL) xmlInitCharEncodingHandlers();
1792 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1793 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1794
1795 /*
1796 * Do the alias resolution
1797 */
1798 norig = name;
1799 nalias = xmlGetEncodingAlias(name);
1800 if (nalias != NULL)
1801 name = nalias;
1802
1803 /*
1804 * Check first for directly registered encoding names
1805 */
1806 for (i = 0;i < 99;i++) {
1807 upper[i] = toupper(name[i]);
1808 if (upper[i] == 0) break;
1809 }
1810 upper[i] = 0;
1811
1812 for (i = 0;i < nbCharEncodingHandler; i++)
1813 if (!strcmp(upper, handlers[i]->name)) {
1814#ifdef DEBUG_ENCODING
1815 xmlGenericError(xmlGenericErrorContext,
1816 "Found registered handler for encoding %s\n", name);
1817#endif
1818 return(handlers[i]);
1819 }
1820
1821#ifdef LIBXML_ICONV_ENABLED
1822 /* check whether iconv can handle this */
1823 icv_in = iconv_open("UTF-8", name);
1824 icv_out = iconv_open(name, "UTF-8");
1825 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1826 enc = (xmlCharEncodingHandlerPtr)
1827 xmlMalloc(sizeof(xmlCharEncodingHandler));
1828 if (enc == NULL) {
1829 iconv_close(icv_in);
1830 iconv_close(icv_out);
1831 return(NULL);
1832 }
1833 enc->name = xmlMemStrdup(name);
1834 enc->input = NULL;
1835 enc->output = NULL;
1836 enc->iconv_in = icv_in;
1837 enc->iconv_out = icv_out;
1838#ifdef DEBUG_ENCODING
1839 xmlGenericError(xmlGenericErrorContext,
1840 "Found iconv handler for encoding %s\n", name);
1841#endif
1842 return enc;
1843 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1844 xmlGenericError(xmlGenericErrorContext,
1845 "iconv : problems with filters for '%s'\n", name);
1846 }
1847#endif /* LIBXML_ICONV_ENABLED */
1848
1849#ifdef DEBUG_ENCODING
1850 xmlGenericError(xmlGenericErrorContext,
1851 "No handler found for encoding %s\n", name);
1852#endif
1853
1854 /*
1855 * Fallback using the canonical names
1856 */
1857 alias = xmlParseCharEncoding(norig);
1858 if (alias != XML_CHAR_ENCODING_ERROR) {
1859 const char* canon;
1860 canon = xmlGetCharEncodingName(alias);
1861 if ((canon != NULL) && (strcmp(name, canon))) {
1862 return(xmlFindCharEncodingHandler(canon));
1863 }
1864 }
1865
1866 return(NULL);
1867}
1868
Daniel Veillard97ac1312001-05-30 19:14:17 +00001869/************************************************************************
1870 * *
1871 * ICONV based generic conversion functions *
1872 * *
1873 ************************************************************************/
1874
Owen Taylor3473f882001-02-23 17:55:21 +00001875#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert
 * @inlen:  the length of @in
 *
 * Run one iconv() call from @in to @out and translate its outcome into
 * the return codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (the input is not valid in the source
 *        encoding or the result can't fit into the encoding we want), or
 *     -3 if the last bytes can't form a single output char, or for any
 *        other failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t inLeft = *inlen;
    size_t outLeft = *outlen;
    const char *src = (const char *) in;
    char *dst = (char *) out;
    int rc;

    rc = iconv(cd, (char **) &src, &inLeft, &dst, &outLeft);

    /* iconv() counts what is left; callers want what was used */
    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
    } else {
        *inlen -= inLeft;
        *outlen -= outLeft;
    }

    if ((inLeft == 0) && (rc != -1))
        return 0;

#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL (truncated input sequence) and anything else */
    return -3;
}
1934#endif /* LIBXML_ICONV_ENABLED */
1935
Daniel Veillard97ac1312001-05-30 19:14:17 +00001936/************************************************************************
1937 * *
1938 * The real API used by libxml for on-the-fly conversion *
1939 * *
1940 ************************************************************************/
1941
Owen Taylor3473f882001-02-23 17:55:21 +00001942/**
1943 * xmlCharEncFirstLine:
1944 * @handler: char enconding transformation data structure
1945 * @out: an xmlBuffer for the output.
1946 * @in: an xmlBuffer for the input
1947 *
1948 * Front-end for the encoding handler input function, but handle only
1949 * the very first line, i.e. limit itself to 45 chars.
1950 *
1951 * Returns the number of byte written if success, or
1952 * -1 general error
1953 * -2 if the transcoding fails (for *in is not valid utf8 string or
1954 * the result of transformation can't fit into the encoding we want), or
1955 */
1956int
1957xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1958 xmlBufferPtr in) {
1959 int ret = -2;
1960 int written;
1961 int toconv;
1962
1963 if (handler == NULL) return(-1);
1964 if (out == NULL) return(-1);
1965 if (in == NULL) return(-1);
1966
1967 written = out->size - out->use;
1968 toconv = in->use;
1969 if (toconv * 2 >= written) {
1970 xmlBufferGrow(out, toconv);
1971 written = out->size - out->use - 1;
1972 }
1973
1974 /*
1975 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1976 * 45 chars should be sufficient to reach the end of the encoding
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001977 * declaration without going too far inside the document content.
Owen Taylor3473f882001-02-23 17:55:21 +00001978 */
1979 written = 45;
1980
1981 if (handler->input != NULL) {
1982 ret = handler->input(&out->content[out->use], &written,
1983 in->content, &toconv);
1984 xmlBufferShrink(in, toconv);
1985 out->use += written;
1986 out->content[out->use] = 0;
1987 }
1988#ifdef LIBXML_ICONV_ENABLED
1989 else if (handler->iconv_in != NULL) {
1990 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1991 &written, in->content, &toconv);
1992 xmlBufferShrink(in, toconv);
1993 out->use += written;
1994 out->content[out->use] = 0;
1995 if (ret == -1) ret = -3;
1996 }
1997#endif /* LIBXML_ICONV_ENABLED */
1998#ifdef DEBUG_ENCODING
1999 switch (ret) {
2000 case 0:
2001 xmlGenericError(xmlGenericErrorContext,
2002 "converted %d bytes to %d bytes of input\n",
2003 toconv, written);
2004 break;
2005 case -1:
2006 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2007 toconv, written, in->use);
2008 break;
2009 case -2:
2010 xmlGenericError(xmlGenericErrorContext,
2011 "input conversion failed due to input error\n");
2012 break;
2013 case -3:
2014 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2015 toconv, written, in->use);
2016 break;
2017 default:
2018 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2019 }
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002020#endif /* DEBUG_ENCODING */
Owen Taylor3473f882001-02-23 17:55:21 +00002021 /*
2022 * Ignore when input buffer is not on a boundary
2023 */
2024 if (ret == -3) ret = 0;
2025 if (ret == -1) ret = 0;
2026 return(ret);
2027}
2028
2029/**
2030 * xmlCharEncInFunc:
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002031 * @handler: char encoding transformation data structure
Owen Taylor3473f882001-02-23 17:55:21 +00002032 * @out: an xmlBuffer for the output.
2033 * @in: an xmlBuffer for the input
2034 *
2035 * Generic front-end for the encoding handler input function
2036 *
2037 * Returns the number of byte written if success, or
2038 * -1 general error
2039 * -2 if the transcoding fails (for *in is not valid utf8 string or
2040 * the result of transformation can't fit into the encoding we want), or
2041 */
2042int
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002043xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2044 xmlBufferPtr in)
2045{
Owen Taylor3473f882001-02-23 17:55:21 +00002046 int ret = -2;
2047 int written;
2048 int toconv;
2049
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002050 if (handler == NULL)
2051 return (-1);
2052 if (out == NULL)
2053 return (-1);
2054 if (in == NULL)
2055 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00002056
2057 toconv = in->use;
2058 if (toconv == 0)
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002059 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00002060 written = out->size - out->use;
2061 if (toconv * 2 >= written) {
2062 xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002063 written = out->size - out->use - 1;
Owen Taylor3473f882001-02-23 17:55:21 +00002064 }
2065 if (handler->input != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002066 ret = handler->input(&out->content[out->use], &written,
2067 in->content, &toconv);
2068 xmlBufferShrink(in, toconv);
2069 out->use += written;
2070 out->content[out->use] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002071 }
2072#ifdef LIBXML_ICONV_ENABLED
2073 else if (handler->iconv_in != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002074 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2075 &written, in->content, &toconv);
2076 xmlBufferShrink(in, toconv);
2077 out->use += written;
2078 out->content[out->use] = 0;
2079 if (ret == -1)
2080 ret = -3;
Owen Taylor3473f882001-02-23 17:55:21 +00002081 }
2082#endif /* LIBXML_ICONV_ENABLED */
2083 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002084 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002085#ifdef DEBUG_ENCODING
2086 xmlGenericError(xmlGenericErrorContext,
2087 "converted %d bytes to %d bytes of input\n",
2088 toconv, written);
Owen Taylor3473f882001-02-23 17:55:21 +00002089#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002090 break;
2091 case -1:
2092#ifdef DEBUG_ENCODING
2093 xmlGenericError(xmlGenericErrorContext,
2094 "converted %d bytes to %d bytes of input, %d left\n",
2095 toconv, written, in->use);
2096#endif
2097 break;
2098 case -3:
2099#ifdef DEBUG_ENCODING
2100 xmlGenericError(xmlGenericErrorContext,
2101 "converted %d bytes to %d bytes of input, %d left\n",
2102 toconv, written, in->use);
2103#endif
2104 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002105 case -2:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002106 xmlGenericError(xmlGenericErrorContext,
2107 "input conversion failed due to input error\n");
2108 xmlGenericError(xmlGenericErrorContext,
2109 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2110 in->content[0], in->content[1],
2111 in->content[2], in->content[3]);
Owen Taylor3473f882001-02-23 17:55:21 +00002112 }
2113 /*
2114 * Ignore when input buffer is not on a boundary
2115 */
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002116 if (ret == -3)
2117 ret = 0;
2118 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00002119}
2120
2121/**
2122 * xmlCharEncOutFunc:
2123 * @handler: char enconding transformation data structure
2124 * @out: an xmlBuffer for the output.
2125 * @in: an xmlBuffer for the input
2126 *
2127 * Generic front-end for the encoding handler output function
2128 * a first call with @in == NULL has to be made firs to initiate the
2129 * output in case of non-stateless encoding needing to initiate their
2130 * state or the output (like the BOM in UTF16).
2131 * In case of UTF8 sequence conversion errors for the given encoder,
2132 * the content will be automatically remapped to a CharRef sequence.
2133 *
2134 * Returns the number of byte written if success, or
2135 * -1 general error
2136 * -2 if the transcoding fails (for *in is not valid utf8 string or
2137 * the result of transformation can't fit into the encoding we want), or
2138 */
2139int
2140xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2141 xmlBufferPtr in) {
2142 int ret = -2;
2143 int written;
2144 int writtentot = 0;
2145 int toconv;
2146 int output = 0;
2147
2148 if (handler == NULL) return(-1);
2149 if (out == NULL) return(-1);
2150
2151retry:
2152
2153 written = out->size - out->use;
2154
2155 /*
2156 * First specific handling of in = NULL, i.e. the initialization call
2157 */
2158 if (in == NULL) {
2159 toconv = 0;
2160 if (handler->output != NULL) {
2161 ret = handler->output(&out->content[out->use], &written,
2162 NULL, &toconv);
2163 out->use += written;
2164 out->content[out->use] = 0;
2165 }
2166#ifdef LIBXML_ICONV_ENABLED
2167 else if (handler->iconv_out != NULL) {
2168 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2169 &written, NULL, &toconv);
2170 out->use += written;
2171 out->content[out->use] = 0;
2172 }
2173#endif /* LIBXML_ICONV_ENABLED */
2174#ifdef DEBUG_ENCODING
2175 xmlGenericError(xmlGenericErrorContext,
2176 "initialized encoder\n");
2177#endif
2178 return(0);
2179 }
2180
2181 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002182 * Conversion itself.
Owen Taylor3473f882001-02-23 17:55:21 +00002183 */
2184 toconv = in->use;
2185 if (toconv == 0)
2186 return(0);
2187 if (toconv * 2 >= written) {
2188 xmlBufferGrow(out, toconv * 2);
2189 written = out->size - out->use - 1;
2190 }
2191 if (handler->output != NULL) {
2192 ret = handler->output(&out->content[out->use], &written,
2193 in->content, &toconv);
2194 xmlBufferShrink(in, toconv);
2195 out->use += written;
2196 writtentot += written;
2197 out->content[out->use] = 0;
2198 }
2199#ifdef LIBXML_ICONV_ENABLED
2200 else if (handler->iconv_out != NULL) {
2201 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2202 &written, in->content, &toconv);
2203 xmlBufferShrink(in, toconv);
2204 out->use += written;
2205 writtentot += written;
2206 out->content[out->use] = 0;
2207 if (ret == -1) {
2208 if (written > 0) {
2209 /*
2210 * Can be a limitation of iconv
2211 */
2212 goto retry;
2213 }
2214 ret = -3;
2215 }
2216 }
2217#endif /* LIBXML_ICONV_ENABLED */
2218 else {
2219 xmlGenericError(xmlGenericErrorContext,
2220 "xmlCharEncOutFunc: no output function !\n");
2221 return(-1);
2222 }
2223
2224 if (ret >= 0) output += ret;
2225
2226 /*
2227 * Attempt to handle error cases
2228 */
2229 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002230 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002231#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002232 xmlGenericError(xmlGenericErrorContext,
2233 "converted %d bytes to %d bytes of output\n",
2234 toconv, written);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002235#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002236 break;
2237 case -1:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002238#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002239 xmlGenericError(xmlGenericErrorContext,
2240 "output conversion failed by lack of space\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002241#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002242 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002243 case -3:
2244 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2245 toconv, written, in->use);
2246 break;
2247 case -2: {
2248 int len = in->use;
2249 const xmlChar *utf = (const xmlChar *) in->content;
2250 int cur;
2251
2252 cur = xmlGetUTF8Char(utf, &len);
2253 if (cur > 0) {
2254 xmlChar charref[20];
2255
2256#ifdef DEBUG_ENCODING
2257 xmlGenericError(xmlGenericErrorContext,
2258 "handling output conversion error\n");
2259 xmlGenericError(xmlGenericErrorContext,
2260 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2261 in->content[0], in->content[1],
2262 in->content[2], in->content[3]);
2263#endif
2264 /*
2265 * Removes the UTF8 sequence, and replace it by a charref
2266 * and continue the transcoding phase, hoping the error
2267 * did not mangle the encoder state.
2268 */
Daniel Veillard16698282001-09-14 10:29:27 +00002269 sprintf((char *) charref, "&#%d;", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002270 xmlBufferShrink(in, len);
2271 xmlBufferAddHead(in, charref, -1);
2272
2273 goto retry;
2274 } else {
2275 xmlGenericError(xmlGenericErrorContext,
2276 "output conversion failed due to conv error\n");
2277 xmlGenericError(xmlGenericErrorContext,
2278 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2279 in->content[0], in->content[1],
2280 in->content[2], in->content[3]);
2281 in->content[0] = ' ';
2282 }
2283 break;
2284 }
2285 }
2286 return(ret);
2287}
2288
2289/**
2290 * xmlCharEncCloseFunc:
2291 * @handler: char enconding transformation data structure
2292 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002293 * Generic front-end for encoding handler close function
Owen Taylor3473f882001-02-23 17:55:21 +00002294 *
2295 * Returns 0 if success, or -1 in case of error
2296 */
2297int
2298xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2299 int ret = 0;
2300 if (handler == NULL) return(-1);
2301 if (handler->name == NULL) return(-1);
2302#ifdef LIBXML_ICONV_ENABLED
2303 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002304 * Iconv handlers can be used only once, free the whole block.
Owen Taylor3473f882001-02-23 17:55:21 +00002305 * and the associated icon resources.
2306 */
2307 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2308 if (handler->name != NULL)
2309 xmlFree(handler->name);
2310 handler->name = NULL;
2311 if (handler->iconv_out != NULL) {
2312 if (iconv_close(handler->iconv_out))
2313 ret = -1;
2314 handler->iconv_out = NULL;
2315 }
2316 if (handler->iconv_in != NULL) {
2317 if (iconv_close(handler->iconv_in))
2318 ret = -1;
2319 handler->iconv_in = NULL;
2320 }
2321 xmlFree(handler);
2322 }
2323#endif /* LIBXML_ICONV_ENABLED */
2324#ifdef DEBUG_ENCODING
2325 if (ret)
2326 xmlGenericError(xmlGenericErrorContext,
2327 "failed to close the encoding handler\n");
2328 else
2329 xmlGenericError(xmlGenericErrorContext,
2330 "closed the encoding handler\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002331#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002332
Owen Taylor3473f882001-02-23 17:55:21 +00002333 return(ret);
2334}
2335