blob: df8714beae7c5c12b2f8598a50ae0e33cbbd4f31 [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 *
 * UTF8 string routines from:
 * "William M. Brack" <wbrack@mmm.com.hk>
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 */
25
Daniel Veillard34ce8be2002-03-18 19:37:11 +000026#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000027#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000028
Owen Taylor3473f882001-02-23 17:55:21 +000029#include <string.h>
30
31#ifdef HAVE_CTYPE_H
32#include <ctype.h>
33#endif
34#ifdef HAVE_STDLIB_H
35#include <stdlib.h>
36#endif
Owen Taylor3473f882001-02-23 17:55:21 +000037#ifdef LIBXML_ICONV_ENABLED
38#ifdef HAVE_ERRNO_H
39#include <errno.h>
40#endif
41#endif
42#include <libxml/encoding.h>
43#include <libxml/xmlmemory.h>
44#ifdef LIBXML_HTML_ENABLED
45#include <libxml/HTMLparser.h>
46#endif
Daniel Veillard64a411c2001-10-15 12:32:07 +000047#include <libxml/globals.h>
Daniel Veillarda4617b82001-11-04 20:19:12 +000048#include <libxml/xmlerror.h>
Owen Taylor3473f882001-02-23 17:55:21 +000049
Daniel Veillard22090732001-07-16 00:06:07 +000050static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
51static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +000052
53typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
54typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
55struct _xmlCharEncodingAlias {
56 const char *name;
57 const char *alias;
58};
59
60static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
61static int xmlCharEncodingAliasesNb = 0;
62static int xmlCharEncodingAliasesMax = 0;
63
64#ifdef LIBXML_ICONV_ENABLED
65#if 0
66#define DEBUG_ENCODING /* Define this to get encoding traces */
67#endif
68#endif
69
70static int xmlLittleEndian = 1;
71
/************************************************************************
 *									*
 *		Generic UTF8 handling routines				*
 *									*
 * From rfc2044: encoding of the Unicode values on UTF-8:		*
 *									*
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)		*
 * 0000 0000-0000 007F   0xxxxxxx					*
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx				*
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx			*
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx		*
 *									*
 * The routines below accept sequences of up to 4 bytes, i.e. values	*
 * above 0xFFFF are handled as well.					*
 *									*
 ************************************************************************/
Owen Taylor3473f882001-02-23 17:55:21 +000086
87/**
Daniel Veillarde043ee12001-04-16 14:08:07 +000088 * xmlUTF8Strlen:
89 * @utf: a sequence of UTF-8 encoded bytes
90 *
Daniel Veillard60087f32001-10-10 09:45:09 +000091 * compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillarde043ee12001-04-16 14:08:07 +000092 * checking of the content of the string.
93 *
94 * Returns the number of characters in the string or -1 in case of error
95 */
96int
Daniel Veillard97ac1312001-05-30 19:14:17 +000097xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillarde043ee12001-04-16 14:08:07 +000098 int ret = 0;
99
100 if (utf == NULL)
101 return(-1);
102
103 while (*utf != 0) {
104 if (utf[0] & 0x80) {
105 if ((utf[1] & 0xc0) != 0x80)
106 return(-1);
107 if ((utf[0] & 0xe0) == 0xe0) {
108 if ((utf[2] & 0xc0) != 0x80)
109 return(-1);
110 if ((utf[0] & 0xf0) == 0xf0) {
111 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
112 return(-1);
113 utf += 4;
114 } else {
115 utf += 3;
116 }
117 } else {
118 utf += 2;
119 }
120 } else {
121 utf++;
122 }
123 ret++;
124 }
125 return(ret);
126}
127
128/**
Owen Taylor3473f882001-02-23 17:55:21 +0000129 * xmlGetUTF8Char:
130 * @utf: a sequence of UTF-8 encoded bytes
131 * @len: a pointer to @bytes len
132 *
133 * Read one UTF8 Char from @utf
134 *
135 * Returns the char value or -1 in case of error and update @len with the
136 * number of bytes used
137 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000138static int
Owen Taylor3473f882001-02-23 17:55:21 +0000139xmlGetUTF8Char(const unsigned char *utf, int *len) {
140 unsigned int c;
141
142 if (utf == NULL)
143 goto error;
144 if (len == NULL)
145 goto error;
146 if (*len < 1)
147 goto error;
148
149 c = utf[0];
150 if (c & 0x80) {
151 if (*len < 2)
152 goto error;
153 if ((utf[1] & 0xc0) != 0x80)
154 goto error;
155 if ((c & 0xe0) == 0xe0) {
156 if (*len < 3)
157 goto error;
158 if ((utf[2] & 0xc0) != 0x80)
159 goto error;
160 if ((c & 0xf0) == 0xf0) {
161 if (*len < 4)
162 goto error;
163 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
164 goto error;
165 *len = 4;
166 /* 4-byte code */
167 c = (utf[0] & 0x7) << 18;
168 c |= (utf[1] & 0x3f) << 12;
169 c |= (utf[2] & 0x3f) << 6;
170 c |= utf[3] & 0x3f;
171 } else {
172 /* 3-byte code */
173 *len = 3;
174 c = (utf[0] & 0xf) << 12;
175 c |= (utf[1] & 0x3f) << 6;
176 c |= utf[2] & 0x3f;
177 }
178 } else {
179 /* 2-byte code */
180 *len = 2;
181 c = (utf[0] & 0x1f) << 6;
182 c |= utf[1] & 0x3f;
183 }
184 } else {
185 /* 1-byte code */
186 *len = 1;
187 }
188 return(c);
189
190error:
191 *len = 0;
192 return(-1);
193}
194
195/**
196 * xmlCheckUTF8: Check utf-8 string for legality.
197 * @utf: Pointer to putative utf-8 encoded string.
198 *
199 * Checks @utf for being valid utf-8. @utf is assumed to be
200 * null-terminated. This function is not super-strict, as it will
201 * allow longer utf-8 sequences than necessary. Note that Java is
202 * capable of producing these sequences if provoked. Also note, this
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000203 * routine checks for the 4-byte maximum size, but does not check for
Owen Taylor3473f882001-02-23 17:55:21 +0000204 * 0x10ffff maximum value.
205 *
206 * Return value: true if @utf is valid.
207 **/
208int
209xmlCheckUTF8(const unsigned char *utf)
210{
211 int ix;
212 unsigned char c;
213
214 for (ix = 0; (c = utf[ix]);) {
215 if (c & 0x80) {
216 if ((utf[ix + 1] & 0xc0) != 0x80)
217 return(0);
218 if ((c & 0xe0) == 0xe0) {
219 if ((utf[ix + 2] & 0xc0) != 0x80)
220 return(0);
221 if ((c & 0xf0) == 0xf0) {
222 if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
223 return(0);
224 ix += 4;
225 /* 4-byte code */
226 } else
227 /* 3-byte code */
228 ix += 3;
229 } else
230 /* 2-byte code */
231 ix += 2;
232 } else
233 /* 1-byte code */
234 ix++;
235 }
236 return(1);
237}
238
239/**
Daniel Veillard97ac1312001-05-30 19:14:17 +0000240 * xmlUTF8Strsize:
241 * @utf: a sequence of UTF-8 encoded bytes
242 * @len: the number of characters in the array
243 *
244 * storage size of an UTF8 string
245 *
246 * Returns the storage size of
247 * the first 'len' characters of ARRAY
248 *
249 */
250
251int
252xmlUTF8Strsize(const xmlChar *utf, int len) {
253 const xmlChar *ptr=utf;
254 xmlChar ch;
255
256 if (len <= 0)
257 return(0);
258
259 while ( len-- > 0) {
260 if ( !*ptr )
261 break;
262 if ( (ch = *ptr++) & 0x80)
263 while ( (ch<<=1) & 0x80 )
264 ptr++;
265 }
266 return (ptr - utf);
267}
268
269
270/**
271 * xmlUTF8Strndup:
272 * @utf: the input UTF8 *
273 * @len: the len of @utf (in chars)
274 *
275 * a strndup for array of UTF8's
276 *
277 * Returns a new UTF8 * or NULL
278 */
279xmlChar *
280xmlUTF8Strndup(const xmlChar *utf, int len) {
281 xmlChar *ret;
282 int i;
283
284 if ((utf == NULL) || (len < 0)) return(NULL);
285 i = xmlUTF8Strsize(utf, len);
286 ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
287 if (ret == NULL) {
288 xmlGenericError(xmlGenericErrorContext,
289 "malloc of %ld byte failed\n",
290 (len + 1) * (long)sizeof(xmlChar));
291 return(NULL);
292 }
293 memcpy(ret, utf, i * sizeof(xmlChar));
294 ret[i] = 0;
295 return(ret);
296}
297
298/**
299 * xmlUTF8Strpos:
300 * @utf: the input UTF8 *
301 * @pos: the position of the desired UTF8 char (in chars)
302 *
303 * a function to provide the equivalent of fetching a
304 * character from a string array
305 *
306 * Returns a pointer to the UTF8 character or NULL
307 */
308xmlChar *
309xmlUTF8Strpos(const xmlChar *utf, int pos) {
310 xmlChar ch;
311
312 if (utf == NULL) return(NULL);
313 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
314 return(NULL);
315 while (pos--) {
316 if ((ch=*utf++) == 0) return(NULL);
317 if ( ch & 0x80 ) {
318 /* if not simple ascii, verify proper format */
319 if ( (ch & 0xc0) != 0xc0 )
320 return(NULL);
321 /* then skip over remaining bytes for this char */
322 while ( (ch <<= 1) & 0x80 )
323 if ( (*utf++ & 0xc0) != 0x80 )
324 return(NULL);
325 }
326 }
327 return((xmlChar *)utf);
328}
329
330/**
331 * xmlUTF8Strloc:
332 * @utf: the input UTF8 *
333 * @utfchar: the UTF8 character to be found
334 *
335 * a function to provide relative location of a UTF8 char
336 *
337 * Returns the relative character position of the desired char
338 * or -1 if not found
339 */
340int
341xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
342 int i, size;
343 xmlChar ch;
344
345 if (utf==NULL || utfchar==NULL) return -1;
346 size = xmlUTF8Strsize(utfchar, 1);
347 for(i=0; (ch=*utf) != 0; i++) {
348 if (xmlStrncmp(utf, utfchar, size)==0)
349 return(i);
350 utf++;
351 if ( ch & 0x80 ) {
352 /* if not simple ascii, verify proper format */
353 if ( (ch & 0xc0) != 0xc0 )
354 return(-1);
355 /* then skip over remaining bytes for this char */
356 while ( (ch <<= 1) & 0x80 )
357 if ( (*utf++ & 0xc0) != 0x80 )
358 return(-1);
359 }
360 }
361
362 return(-1);
363}
364/**
365 * xmlUTF8Strsub:
366 * @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard97ac1312001-05-30 19:14:17 +0000367 * @start: relative pos of first char
368 * @len: total number to copy
369 *
370 * Note: positions are given in units of UTF-8 chars
371 *
372 * Returns a pointer to a newly created string
373 * or NULL if any problem
374 */
375
376xmlChar *
377xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
378 int i;
379 xmlChar ch;
380
381 if (utf == NULL) return(NULL);
382 if (start < 0) return(NULL);
383 if (len < 0) return(NULL);
384
385 /*
386 * Skip over any leading chars
387 */
388 for (i = 0;i < start;i++) {
389 if ((ch=*utf++) == 0) return(NULL);
390 if ( ch & 0x80 ) {
391 /* if not simple ascii, verify proper format */
392 if ( (ch & 0xc0) != 0xc0 )
393 return(NULL);
394 /* then skip over remaining bytes for this char */
395 while ( (ch <<= 1) & 0x80 )
396 if ( (*utf++ & 0xc0) != 0x80 )
397 return(NULL);
398 }
399 }
400
401 return(xmlUTF8Strndup(utf, len));
402}
403
404/************************************************************************
405 * *
406 * Conversions To/From UTF8 encoding *
407 * *
408 ************************************************************************/
409
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out.  Every ASCII char maps to the identical single
 * UTF-8 byte, so the conversion uses the output buffer up to its last
 * byte.
 *
 * Returns 0 if success, or -1 if a byte >= 0x80 is met in @in.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are also updated when -1 is returned and then describe what was
 * converted before the offending byte.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend = in + (*inlen);

    while ((in < inend) && (out < outend)) {
        if (*in >= 0x80) {
            /* not an ASCII char: report what was converted so far */
            *outlen = out - outstart;
            *inlen = in - base;
            return(-1);
        }
        *out++ = *in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
460
461/**
462 * UTF8Toascii:
463 * @out: a pointer to an array of bytes to store the result
464 * @outlen: the length of @out
465 * @in: a pointer to an array of UTF-8 chars
466 * @inlen: the length of @in
467 *
468 * Take a block of UTF-8 chars in and try to convert it to an ASCII
469 * block of chars out.
470 *
471 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
472 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000473 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +0000474 * The value of @outlen after return is the number of ocetes consumed.
475 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +0000476static int
Owen Taylor3473f882001-02-23 17:55:21 +0000477UTF8Toascii(unsigned char* out, int *outlen,
478 const unsigned char* in, int *inlen) {
479 const unsigned char* processed = in;
480 const unsigned char* outend;
481 const unsigned char* outstart = out;
482 const unsigned char* instart = in;
483 const unsigned char* inend;
484 unsigned int c, d;
485 int trailing;
486
487 if (in == NULL) {
488 /*
489 * initialization nothing to do
490 */
491 *outlen = 0;
492 *inlen = 0;
493 return(0);
494 }
495 inend = in + (*inlen);
496 outend = out + (*outlen);
497 while (in < inend) {
498 d = *in++;
499 if (d < 0x80) { c= d; trailing= 0; }
500 else if (d < 0xC0) {
501 /* trailing byte in leading position */
502 *outlen = out - outstart;
503 *inlen = processed - instart;
504 return(-2);
505 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
506 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
507 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
508 else {
509 /* no chance for this in Ascii */
510 *outlen = out - outstart;
511 *inlen = processed - instart;
512 return(-2);
513 }
514
515 if (inend - in < trailing) {
516 break;
517 }
518
519 for ( ; trailing; trailing--) {
520 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
521 break;
522 c <<= 6;
523 c |= d & 0x3F;
524 }
525
526 /* assertion: c is a single UTF-4 value */
527 if (c < 0x80) {
528 if (out >= outend)
529 break;
530 *out++ = c;
531 } else {
532 /* no chance for this in Ascii */
533 *outlen = out - outstart;
534 *inlen = processed - instart;
535 return(-2);
536 }
537 processed = in;
538 }
539 *outlen = out - outstart;
540 *inlen = processed - instart;
541 return(0);
542}
543
544/**
545 * isolat1ToUTF8:
546 * @out: a pointer to an array of bytes to store the result
547 * @outlen: the length of @out
548 * @in: a pointer to an array of ISO Latin 1 chars
549 * @inlen: the length of @in
550 *
551 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
552 * block of chars out.
553 * Returns 0 if success, or -1 otherwise
554 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000555 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +0000556 * The value of @outlen after return is the number of ocetes consumed.
557 */
558int
559isolat1ToUTF8(unsigned char* out, int *outlen,
560 const unsigned char* in, int *inlen) {
561 unsigned char* outstart = out;
562 const unsigned char* base = in;
563 const unsigned char* processed = in;
564 unsigned char* outend = out + *outlen;
565 const unsigned char* inend;
566 unsigned int c;
Owen Taylor3473f882001-02-23 17:55:21 +0000567
568 inend = in + (*inlen);
Daniel Veillard02141ea2001-04-30 11:46:40 +0000569 while (in < inend) {
570 c = *in++;
Owen Taylor3473f882001-02-23 17:55:21 +0000571
Owen Taylor3473f882001-02-23 17:55:21 +0000572 if (out >= outend)
573 break;
Daniel Veillard02141ea2001-04-30 11:46:40 +0000574
575 if (c < 0x80) {
576 *out++ = c;
577 processed++;
578 continue;
579 } else {
580 *out++= ((c >> 6) & 0x1F) | 0xC0;
Owen Taylor3473f882001-02-23 17:55:21 +0000581 if (out >= outend)
Daniel Veillard02141ea2001-04-30 11:46:40 +0000582 break;
583 *out++= (c & 0x3F) | 0x80;
584 processed++;
Owen Taylor3473f882001-02-23 17:55:21 +0000585 }
Owen Taylor3473f882001-02-23 17:55:21 +0000586 }
587 *outlen = out - outstart;
588 *inlen = processed - base;
589 return(0);
590}
591
592/**
593 * UTF8Toisolat1:
594 * @out: a pointer to an array of bytes to store the result
595 * @outlen: the length of @out
596 * @in: a pointer to an array of UTF-8 chars
597 * @inlen: the length of @in
598 *
599 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
600 * block of chars out.
601 *
602 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
603 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000604 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +0000605 * The value of @outlen after return is the number of ocetes consumed.
606 */
607int
608UTF8Toisolat1(unsigned char* out, int *outlen,
609 const unsigned char* in, int *inlen) {
610 const unsigned char* processed = in;
611 const unsigned char* outend;
612 const unsigned char* outstart = out;
613 const unsigned char* instart = in;
614 const unsigned char* inend;
615 unsigned int c, d;
616 int trailing;
617
618 if (in == NULL) {
619 /*
620 * initialization nothing to do
621 */
622 *outlen = 0;
623 *inlen = 0;
624 return(0);
625 }
626 inend = in + (*inlen);
627 outend = out + (*outlen);
628 while (in < inend) {
629 d = *in++;
630 if (d < 0x80) { c= d; trailing= 0; }
631 else if (d < 0xC0) {
632 /* trailing byte in leading position */
633 *outlen = out - outstart;
634 *inlen = processed - instart;
635 return(-2);
636 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
637 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
638 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
639 else {
640 /* no chance for this in IsoLat1 */
641 *outlen = out - outstart;
642 *inlen = processed - instart;
643 return(-2);
644 }
645
646 if (inend - in < trailing) {
647 break;
648 }
649
650 for ( ; trailing; trailing--) {
651 if (in >= inend)
652 break;
653 if (((d= *in++) & 0xC0) != 0x80) {
654 *outlen = out - outstart;
655 *inlen = processed - instart;
656 return(-2);
657 }
658 c <<= 6;
659 c |= d & 0x3F;
660 }
661
662 /* assertion: c is a single UTF-4 value */
663 if (c <= 0xFF) {
664 if (out >= outend)
665 break;
666 *out++ = c;
667 } else {
668 /* no chance for this in IsoLat1 */
669 *outlen = out - outstart;
670 *inlen = processed - instart;
671 return(-2);
672 }
673 processed = in;
674 }
675 *outlen = out - outstart;
676 *inlen = processed - instart;
677 return(0);
678}
679
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is read byte by byte (low byte
 * first), so the result depends neither on the byte order of the
 * machine nor on the alignment of @inb.  A trailing odd byte and a high
 * surrogate whose low surrogate is not yet available are left
 * unconsumed.  A character is converted only when its whole UTF-8
 * sequence fits in @out.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate is not followed by a low surrogate).
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 * Both are also updated when -2 is returned.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int needed;
    int bits;

    /* only whole 16-bit units can be converted */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    while (in < inend) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)             /* second half not there yet */
                break;
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x80) needed = 1;
        else if (c < 0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else needed = 4;
        /* never write a partial sequence */
        if (outend - out < needed)
            break;

        switch (needed) {
            case 1:
                *out++ = (unsigned char) c;
                break;
            case 2:
                *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
                break;
            case 3:
                *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
                break;
            default:
                *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);
                break;
        }
        /* continuation bytes, 6 payload bits each, most significant first */
        for (bits = (needed - 2) * 6; bits >= 0; bits -= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
767
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.  The output is written byte by byte (low byte
 * first), so the result depends neither on the byte order of the
 * machine nor on the alignment of @outb.  An incomplete sequence at the
 * end of @in is left unconsumed.
 *
 * A NULL @in is an initialization call: the Byte Order Mark FF FE is
 * written if @outb has room for it, and 2 is returned in that case.
 *
 * Returns 0 if success (2 when the Byte Order Mark was written), or -2
 * if the transcoding failed (malformed UTF-8 or a value above 0x10FFFF).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are also updated when -2 is returned.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int high, low;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units are ever written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) goto failed;   /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else goto failed;                 /* no chance for this in UTF-16 */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto failed;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        } else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            high = 0xD800 | (c >> 10);
            low = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (high & 0xFF);
            *out++ = (unsigned char) (high >> 8);
            *out++ = (unsigned char) (low & 0xFF);
            *out++ = (unsigned char) (low >> 8);
        } else {
            /* beyond the range UTF-16 can represent */
            goto failed;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

failed:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
886
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is read byte by byte (high byte
 * first), so the result depends neither on the byte order of the
 * machine nor on the alignment of @inb.  A trailing odd byte and a high
 * surrogate whose low surrogate is not yet available are left
 * unconsumed.  A character is converted only when its whole UTF-8
 * sequence fits in @out.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate is not followed by a low surrogate).
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 * Both are also updated when -2 is returned.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int needed;
    int bits;

    /* only whole 16-bit units can be converted */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)             /* second half not there yet */
                break;
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x80) needed = 1;
        else if (c < 0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else needed = 4;
        /* never write a partial sequence */
        if (outend - out < needed)
            break;

        switch (needed) {
            case 1:
                *out++ = (unsigned char) c;
                break;
            case 2:
                *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
                break;
            case 3:
                *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
                break;
            default:
                *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);
                break;
        }
        /* continuation bytes, 6 payload bits each, most significant first */
        for (bits = (needed - 2) * 6; bits >= 0; bits -= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
978
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out.  The output is written byte by byte (high byte
 * first), so the result depends neither on the byte order of the
 * machine nor on the alignment of @outb.  An incomplete sequence at the
 * end of @in is left unconsumed.
 *
 * A NULL @in is an initialization call: the Byte Order Mark FE FF is
 * written if @outb has room for it, and 2 is returned in that case.
 *
 * Returns 0 if success (2 when the Byte Order Mark was written), or -2
 * if the transcoding failed (malformed UTF-8 or a value above 0x10FFFF).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are also updated when -2 is returned.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int high, low;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units are ever written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) goto failed;   /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else goto failed;                 /* no chance for this in UTF-16 */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto failed;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) (c & 0xFF);
        } else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            high = 0xD800 | (c >> 10);
            low = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (high >> 8);
            *out++ = (unsigned char) (high & 0xFF);
            *out++ = (unsigned char) (low >> 8);
            *out++ = (unsigned char) (low & 0xFF);
        } else {
            /* beyond the range UTF-16 can represent */
            goto failed;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

failed:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
1094
Daniel Veillard97ac1312001-05-30 19:14:17 +00001095/************************************************************************
1096 * *
1097 * Generic encoding handling routines *
1098 * *
1099 ************************************************************************/
1100
Owen Taylor3473f882001-02-23 17:55:21 +00001101/**
1102 * xmlDetectCharEncoding:
1103 * @in: a pointer to the first bytes of the XML entity, must be at least
1104 * 4 bytes long.
1105 * @len: pointer to the length of the buffer
1106 *
1107 * Guess the encoding of the entity using the first bytes of the entity content
1108 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1109 *
1110 * Returns one of the XML_CHAR_ENCODING_... values.
1111 */
1112xmlCharEncoding
1113xmlDetectCharEncoding(const unsigned char* in, int len)
1114{
1115 if (len >= 4) {
1116 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1117 (in[2] == 0x00) && (in[3] == 0x3C))
1118 return(XML_CHAR_ENCODING_UCS4BE);
1119 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1120 (in[2] == 0x00) && (in[3] == 0x00))
1121 return(XML_CHAR_ENCODING_UCS4LE);
1122 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1123 (in[2] == 0x3C) && (in[3] == 0x00))
1124 return(XML_CHAR_ENCODING_UCS4_2143);
1125 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1126 (in[2] == 0x00) && (in[3] == 0x00))
1127 return(XML_CHAR_ENCODING_UCS4_3412);
1128 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1129 (in[2] == 0xA7) && (in[3] == 0x94))
1130 return(XML_CHAR_ENCODING_EBCDIC);
1131 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1132 (in[2] == 0x78) && (in[3] == 0x6D))
1133 return(XML_CHAR_ENCODING_UTF8);
1134 }
Daniel Veillard87a764e2001-06-20 17:41:10 +00001135 if (len >= 3) {
1136 /*
1137 * Errata on XML-1.0 June 20 2001
1138 * We now allow an UTF8 encoded BOM
1139 */
1140 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1141 (in[2] == 0xBF))
1142 return(XML_CHAR_ENCODING_UTF8);
1143 }
Owen Taylor3473f882001-02-23 17:55:21 +00001144 if (len >= 2) {
1145 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1146 return(XML_CHAR_ENCODING_UTF16BE);
1147 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1148 return(XML_CHAR_ENCODING_UTF16LE);
1149 }
1150 return(XML_CHAR_ENCODING_NONE);
1151}
1152
1153/**
1154 * xmlCleanupEncodingAliases:
1155 *
1156 * Unregisters all aliases
1157 */
1158void
1159xmlCleanupEncodingAliases(void) {
1160 int i;
1161
1162 if (xmlCharEncodingAliases == NULL)
1163 return;
1164
1165 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1166 if (xmlCharEncodingAliases[i].name != NULL)
1167 xmlFree((char *) xmlCharEncodingAliases[i].name);
1168 if (xmlCharEncodingAliases[i].alias != NULL)
1169 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1170 }
1171 xmlCharEncodingAliasesNb = 0;
1172 xmlCharEncodingAliasesMax = 0;
1173 xmlFree(xmlCharEncodingAliases);
Daniel Veillard73c6e532002-01-08 13:15:33 +00001174 xmlCharEncodingAliases = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001175}
1176
1177/**
1178 * xmlGetEncodingAlias:
1179 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1180 *
1181 * Lookup an encoding name for the given alias.
1182 *
1183 * Returns NULL if not found the original name otherwise
1184 */
1185const char *
1186xmlGetEncodingAlias(const char *alias) {
1187 int i;
1188 char upper[100];
1189
1190 if (alias == NULL)
1191 return(NULL);
1192
1193 if (xmlCharEncodingAliases == NULL)
1194 return(NULL);
1195
1196 for (i = 0;i < 99;i++) {
1197 upper[i] = toupper(alias[i]);
1198 if (upper[i] == 0) break;
1199 }
1200 upper[i] = 0;
1201
1202 /*
1203 * Walk down the list looking for a definition of the alias
1204 */
1205 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1206 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1207 return(xmlCharEncodingAliases[i].name);
1208 }
1209 }
1210 return(NULL);
1211}
1212
1213/**
1214 * xmlAddEncodingAlias:
1215 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1216 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1217 *
1218 * Registers and alias @alias for an encoding named @name. Existing alias
1219 * will be overwritten.
1220 *
1221 * Returns 0 in case of success, -1 in case of error
1222 */
1223int
1224xmlAddEncodingAlias(const char *name, const char *alias) {
1225 int i;
1226 char upper[100];
1227
1228 if ((name == NULL) || (alias == NULL))
1229 return(-1);
1230
1231 for (i = 0;i < 99;i++) {
1232 upper[i] = toupper(alias[i]);
1233 if (upper[i] == 0) break;
1234 }
1235 upper[i] = 0;
1236
1237 if (xmlCharEncodingAliases == NULL) {
1238 xmlCharEncodingAliasesNb = 0;
1239 xmlCharEncodingAliasesMax = 20;
1240 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1241 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1242 if (xmlCharEncodingAliases == NULL)
1243 return(-1);
1244 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1245 xmlCharEncodingAliasesMax *= 2;
1246 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1247 xmlRealloc(xmlCharEncodingAliases,
1248 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1249 }
1250 /*
1251 * Walk down the list looking for a definition of the alias
1252 */
1253 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1254 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1255 /*
1256 * Replace the definition.
1257 */
1258 xmlFree((char *) xmlCharEncodingAliases[i].name);
1259 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1260 return(0);
1261 }
1262 }
1263 /*
1264 * Add the definition
1265 */
1266 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1267 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1268 xmlCharEncodingAliasesNb++;
1269 return(0);
1270}
1271
1272/**
1273 * xmlDelEncodingAlias:
1274 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1275 *
1276 * Unregisters an encoding alias @alias
1277 *
1278 * Returns 0 in case of success, -1 in case of error
1279 */
1280int
1281xmlDelEncodingAlias(const char *alias) {
1282 int i;
1283
1284 if (alias == NULL)
1285 return(-1);
1286
1287 if (xmlCharEncodingAliases == NULL)
1288 return(-1);
1289 /*
1290 * Walk down the list looking for a definition of the alias
1291 */
1292 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1293 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1294 xmlFree((char *) xmlCharEncodingAliases[i].name);
1295 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1296 xmlCharEncodingAliasesNb--;
1297 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1298 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1299 return(0);
1300 }
1301 }
1302 return(-1);
1303}
1304
1305/**
1306 * xmlParseCharEncoding:
1307 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1308 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001309 * Compare the string to the known encoding schemes already known. Note
Owen Taylor3473f882001-02-23 17:55:21 +00001310 * that the comparison is case insensitive accordingly to the section
1311 * [XML] 4.3.3 Character Encoding in Entities.
1312 *
1313 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1314 * if not recognized.
1315 */
1316xmlCharEncoding
1317xmlParseCharEncoding(const char* name)
1318{
1319 const char *alias;
1320 char upper[500];
1321 int i;
1322
1323 if (name == NULL)
1324 return(XML_CHAR_ENCODING_NONE);
1325
1326 /*
1327 * Do the alias resolution
1328 */
1329 alias = xmlGetEncodingAlias(name);
1330 if (alias != NULL)
1331 name = alias;
1332
1333 for (i = 0;i < 499;i++) {
1334 upper[i] = toupper(name[i]);
1335 if (upper[i] == 0) break;
1336 }
1337 upper[i] = 0;
1338
1339 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1340 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1341 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1342
1343 /*
1344 * NOTE: if we were able to parse this, the endianness of UTF16 is
1345 * already found and in use
1346 */
1347 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1348 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1349
1350 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1351 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1352 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1353
1354 /*
1355 * NOTE: if we were able to parse this, the endianness of UCS4 is
1356 * already found and in use
1357 */
1358 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1359 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1360 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1361
1362
1363 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1364 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1365 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1366
1367 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1368 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1369 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1370
1371 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1372 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1373 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1374 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1375 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1376 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1377 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1378
1379 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1380 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1381 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1382
1383#ifdef DEBUG_ENCODING
1384 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1385#endif
1386 return(XML_CHAR_ENCODING_ERROR);
1387}
1388
1389/**
1390 * xmlGetCharEncodingName:
1391 * @enc: the encoding
1392 *
1393 * The "canonical" name for XML encoding.
1394 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1395 * Section 4.3.3 Character Encoding in Entities
1396 *
1397 * Returns the canonical name for the given encoding
1398 */
1399
1400const char*
1401xmlGetCharEncodingName(xmlCharEncoding enc) {
1402 switch (enc) {
1403 case XML_CHAR_ENCODING_ERROR:
1404 return(NULL);
1405 case XML_CHAR_ENCODING_NONE:
1406 return(NULL);
1407 case XML_CHAR_ENCODING_UTF8:
1408 return("UTF-8");
1409 case XML_CHAR_ENCODING_UTF16LE:
1410 return("UTF-16");
1411 case XML_CHAR_ENCODING_UTF16BE:
1412 return("UTF-16");
1413 case XML_CHAR_ENCODING_EBCDIC:
1414 return("EBCDIC");
1415 case XML_CHAR_ENCODING_UCS4LE:
1416 return("ISO-10646-UCS-4");
1417 case XML_CHAR_ENCODING_UCS4BE:
1418 return("ISO-10646-UCS-4");
1419 case XML_CHAR_ENCODING_UCS4_2143:
1420 return("ISO-10646-UCS-4");
1421 case XML_CHAR_ENCODING_UCS4_3412:
1422 return("ISO-10646-UCS-4");
1423 case XML_CHAR_ENCODING_UCS2:
1424 return("ISO-10646-UCS-2");
1425 case XML_CHAR_ENCODING_8859_1:
1426 return("ISO-8859-1");
1427 case XML_CHAR_ENCODING_8859_2:
1428 return("ISO-8859-2");
1429 case XML_CHAR_ENCODING_8859_3:
1430 return("ISO-8859-3");
1431 case XML_CHAR_ENCODING_8859_4:
1432 return("ISO-8859-4");
1433 case XML_CHAR_ENCODING_8859_5:
1434 return("ISO-8859-5");
1435 case XML_CHAR_ENCODING_8859_6:
1436 return("ISO-8859-6");
1437 case XML_CHAR_ENCODING_8859_7:
1438 return("ISO-8859-7");
1439 case XML_CHAR_ENCODING_8859_8:
1440 return("ISO-8859-8");
1441 case XML_CHAR_ENCODING_8859_9:
1442 return("ISO-8859-9");
1443 case XML_CHAR_ENCODING_2022_JP:
1444 return("ISO-2022-JP");
1445 case XML_CHAR_ENCODING_SHIFT_JIS:
1446 return("Shift-JIS");
1447 case XML_CHAR_ENCODING_EUC_JP:
1448 return("EUC-JP");
1449 case XML_CHAR_ENCODING_ASCII:
1450 return(NULL);
1451 }
1452 return(NULL);
1453}
1454
Daniel Veillard97ac1312001-05-30 19:14:17 +00001455/************************************************************************
1456 * *
1457 * Char encoding handlers *
1458 * *
1459 ************************************************************************/
1460
Owen Taylor3473f882001-02-23 17:55:21 +00001461
1462/* the size should be growable, but it's not a big deal ... */
1463#define MAX_ENCODING_HANDLERS 50
1464static xmlCharEncodingHandlerPtr *handlers = NULL;
1465static int nbCharEncodingHandler = 0;
1466
1467/*
1468 * The default is UTF-8 for XML, that's also the default used for the
1469 * parser internals, so the default encoding handler is NULL
1470 */
1471
1472static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1473
1474/**
1475 * xmlNewCharEncodingHandler:
1476 * @name: the encoding name, in UTF-8 format (ASCII actually)
1477 * @input: the xmlCharEncodingInputFunc to read that encoding
1478 * @output: the xmlCharEncodingOutputFunc to write that encoding
1479 *
1480 * Create and registers an xmlCharEncodingHandler.
1481 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1482 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001483static xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001484xmlNewCharEncodingHandler(const char *name,
1485 xmlCharEncodingInputFunc input,
1486 xmlCharEncodingOutputFunc output) {
1487 xmlCharEncodingHandlerPtr handler;
1488 const char *alias;
1489 char upper[500];
1490 int i;
1491 char *up = 0;
1492
1493 /*
1494 * Do the alias resolution
1495 */
1496 alias = xmlGetEncodingAlias(name);
1497 if (alias != NULL)
1498 name = alias;
1499
1500 /*
1501 * Keep only the uppercase version of the encoding.
1502 */
1503 if (name == NULL) {
1504 xmlGenericError(xmlGenericErrorContext,
1505 "xmlNewCharEncodingHandler : no name !\n");
1506 return(NULL);
1507 }
1508 for (i = 0;i < 499;i++) {
1509 upper[i] = toupper(name[i]);
1510 if (upper[i] == 0) break;
1511 }
1512 upper[i] = 0;
1513 up = xmlMemStrdup(upper);
1514 if (up == NULL) {
1515 xmlGenericError(xmlGenericErrorContext,
1516 "xmlNewCharEncodingHandler : out of memory !\n");
1517 return(NULL);
1518 }
1519
1520 /*
1521 * allocate and fill-up an handler block.
1522 */
1523 handler = (xmlCharEncodingHandlerPtr)
1524 xmlMalloc(sizeof(xmlCharEncodingHandler));
1525 if (handler == NULL) {
1526 xmlGenericError(xmlGenericErrorContext,
1527 "xmlNewCharEncodingHandler : out of memory !\n");
1528 return(NULL);
1529 }
1530 handler->input = input;
1531 handler->output = output;
1532 handler->name = up;
1533
1534#ifdef LIBXML_ICONV_ENABLED
1535 handler->iconv_in = NULL;
1536 handler->iconv_out = NULL;
1537#endif /* LIBXML_ICONV_ENABLED */
1538
1539 /*
1540 * registers and returns the handler.
1541 */
1542 xmlRegisterCharEncodingHandler(handler);
1543#ifdef DEBUG_ENCODING
1544 xmlGenericError(xmlGenericErrorContext,
1545 "Registered encoding handler for %s\n", name);
1546#endif
1547 return(handler);
1548}
1549
1550/**
1551 * xmlInitCharEncodingHandlers:
1552 *
1553 * Initialize the char encoding support, it registers the default
1554 * encoding supported.
1555 * NOTE: while public, this function usually doesn't need to be called
1556 * in normal processing.
1557 */
1558void
1559xmlInitCharEncodingHandlers(void) {
1560 unsigned short int tst = 0x1234;
1561 unsigned char *ptr = (unsigned char *) &tst;
1562
1563 if (handlers != NULL) return;
1564
1565 handlers = (xmlCharEncodingHandlerPtr *)
1566 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1567
1568 if (*ptr == 0x12) xmlLittleEndian = 0;
1569 else if (*ptr == 0x34) xmlLittleEndian = 1;
1570 else xmlGenericError(xmlGenericErrorContext,
1571 "Odd problem at endianness detection\n");
1572
1573 if (handlers == NULL) {
1574 xmlGenericError(xmlGenericErrorContext,
1575 "xmlInitCharEncodingHandlers : out of memory !\n");
1576 return;
1577 }
1578 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1579 xmlUTF16LEHandler =
1580 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1581 xmlUTF16BEHandler =
1582 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1583 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1584 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard20042422001-05-31 18:22:04 +00001585 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor3473f882001-02-23 17:55:21 +00001586#ifdef LIBXML_HTML_ENABLED
1587 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1588#endif
1589}
1590
1591/**
1592 * xmlCleanupCharEncodingHandlers:
1593 *
1594 * Cleanup the memory allocated for the char encoding support, it
1595 * unregisters all the encoding handlers and the aliases.
1596 */
1597void
1598xmlCleanupCharEncodingHandlers(void) {
1599 xmlCleanupEncodingAliases();
1600
1601 if (handlers == NULL) return;
1602
1603 for (;nbCharEncodingHandler > 0;) {
1604 nbCharEncodingHandler--;
1605 if (handlers[nbCharEncodingHandler] != NULL) {
1606 if (handlers[nbCharEncodingHandler]->name != NULL)
1607 xmlFree(handlers[nbCharEncodingHandler]->name);
1608 xmlFree(handlers[nbCharEncodingHandler]);
1609 }
1610 }
1611 xmlFree(handlers);
1612 handlers = NULL;
1613 nbCharEncodingHandler = 0;
1614 xmlDefaultCharEncodingHandler = NULL;
1615}
1616
1617/**
1618 * xmlRegisterCharEncodingHandler:
1619 * @handler: the xmlCharEncodingHandlerPtr handler block
1620 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001621 * Register the char encoding handler, surprising, isn't it ?
Owen Taylor3473f882001-02-23 17:55:21 +00001622 */
1623void
1624xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1625 if (handlers == NULL) xmlInitCharEncodingHandlers();
1626 if (handler == NULL) {
1627 xmlGenericError(xmlGenericErrorContext,
1628 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1629 return;
1630 }
1631
1632 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1633 xmlGenericError(xmlGenericErrorContext,
1634 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1635 xmlGenericError(xmlGenericErrorContext,
1636 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1637 return;
1638 }
1639 handlers[nbCharEncodingHandler++] = handler;
1640}
1641
1642/**
1643 * xmlGetCharEncodingHandler:
1644 * @enc: an xmlCharEncoding value.
1645 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001646 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001647 *
1648 * Returns the handler or NULL if not found
1649 */
1650xmlCharEncodingHandlerPtr
1651xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1652 xmlCharEncodingHandlerPtr handler;
1653
1654 if (handlers == NULL) xmlInitCharEncodingHandlers();
1655 switch (enc) {
1656 case XML_CHAR_ENCODING_ERROR:
1657 return(NULL);
1658 case XML_CHAR_ENCODING_NONE:
1659 return(NULL);
1660 case XML_CHAR_ENCODING_UTF8:
1661 return(NULL);
1662 case XML_CHAR_ENCODING_UTF16LE:
1663 return(xmlUTF16LEHandler);
1664 case XML_CHAR_ENCODING_UTF16BE:
1665 return(xmlUTF16BEHandler);
1666 case XML_CHAR_ENCODING_EBCDIC:
1667 handler = xmlFindCharEncodingHandler("EBCDIC");
1668 if (handler != NULL) return(handler);
1669 handler = xmlFindCharEncodingHandler("ebcdic");
1670 if (handler != NULL) return(handler);
1671 break;
1672 case XML_CHAR_ENCODING_UCS4BE:
1673 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1674 if (handler != NULL) return(handler);
1675 handler = xmlFindCharEncodingHandler("UCS-4");
1676 if (handler != NULL) return(handler);
1677 handler = xmlFindCharEncodingHandler("UCS4");
1678 if (handler != NULL) return(handler);
1679 break;
1680 case XML_CHAR_ENCODING_UCS4LE:
1681 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1682 if (handler != NULL) return(handler);
1683 handler = xmlFindCharEncodingHandler("UCS-4");
1684 if (handler != NULL) return(handler);
1685 handler = xmlFindCharEncodingHandler("UCS4");
1686 if (handler != NULL) return(handler);
1687 break;
1688 case XML_CHAR_ENCODING_UCS4_2143:
1689 break;
1690 case XML_CHAR_ENCODING_UCS4_3412:
1691 break;
1692 case XML_CHAR_ENCODING_UCS2:
1693 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1694 if (handler != NULL) return(handler);
1695 handler = xmlFindCharEncodingHandler("UCS-2");
1696 if (handler != NULL) return(handler);
1697 handler = xmlFindCharEncodingHandler("UCS2");
1698 if (handler != NULL) return(handler);
1699 break;
1700
1701 /*
1702 * We used to keep ISO Latin encodings native in the
1703 * generated data. This led to so many problems that
1704 * this has been removed. One can still change this
1705 * back by registering no-ops encoders for those
1706 */
1707 case XML_CHAR_ENCODING_8859_1:
1708 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1709 if (handler != NULL) return(handler);
1710 break;
1711 case XML_CHAR_ENCODING_8859_2:
1712 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1713 if (handler != NULL) return(handler);
1714 break;
1715 case XML_CHAR_ENCODING_8859_3:
1716 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1717 if (handler != NULL) return(handler);
1718 break;
1719 case XML_CHAR_ENCODING_8859_4:
1720 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1721 if (handler != NULL) return(handler);
1722 break;
1723 case XML_CHAR_ENCODING_8859_5:
1724 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1725 if (handler != NULL) return(handler);
1726 break;
1727 case XML_CHAR_ENCODING_8859_6:
1728 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1729 if (handler != NULL) return(handler);
1730 break;
1731 case XML_CHAR_ENCODING_8859_7:
1732 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1733 if (handler != NULL) return(handler);
1734 break;
1735 case XML_CHAR_ENCODING_8859_8:
1736 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1737 if (handler != NULL) return(handler);
1738 break;
1739 case XML_CHAR_ENCODING_8859_9:
1740 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1741 if (handler != NULL) return(handler);
1742 break;
1743
1744
1745 case XML_CHAR_ENCODING_2022_JP:
1746 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1747 if (handler != NULL) return(handler);
1748 break;
1749 case XML_CHAR_ENCODING_SHIFT_JIS:
1750 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1751 if (handler != NULL) return(handler);
1752 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1753 if (handler != NULL) return(handler);
1754 handler = xmlFindCharEncodingHandler("Shift_JIS");
1755 if (handler != NULL) return(handler);
1756 break;
1757 case XML_CHAR_ENCODING_EUC_JP:
1758 handler = xmlFindCharEncodingHandler("EUC-JP");
1759 if (handler != NULL) return(handler);
1760 break;
1761 default:
1762 break;
1763 }
1764
1765#ifdef DEBUG_ENCODING
1766 xmlGenericError(xmlGenericErrorContext,
1767 "No handler found for encoding %d\n", enc);
1768#endif
1769 return(NULL);
1770}
1771
1772/**
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001773 * xmlFindCharEncodingHandler:
1774 * @name: a string describing the char encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001775 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001776 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001777 *
1778 * Returns the handler or NULL if not found
1779 */
1780xmlCharEncodingHandlerPtr
1781xmlFindCharEncodingHandler(const char *name) {
1782 const char *nalias;
1783 const char *norig;
1784 xmlCharEncoding alias;
1785#ifdef LIBXML_ICONV_ENABLED
1786 xmlCharEncodingHandlerPtr enc;
1787 iconv_t icv_in, icv_out;
1788#endif /* LIBXML_ICONV_ENABLED */
1789 char upper[100];
1790 int i;
1791
1792 if (handlers == NULL) xmlInitCharEncodingHandlers();
1793 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1794 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1795
1796 /*
1797 * Do the alias resolution
1798 */
1799 norig = name;
1800 nalias = xmlGetEncodingAlias(name);
1801 if (nalias != NULL)
1802 name = nalias;
1803
1804 /*
1805 * Check first for directly registered encoding names
1806 */
1807 for (i = 0;i < 99;i++) {
1808 upper[i] = toupper(name[i]);
1809 if (upper[i] == 0) break;
1810 }
1811 upper[i] = 0;
1812
1813 for (i = 0;i < nbCharEncodingHandler; i++)
1814 if (!strcmp(upper, handlers[i]->name)) {
1815#ifdef DEBUG_ENCODING
1816 xmlGenericError(xmlGenericErrorContext,
1817 "Found registered handler for encoding %s\n", name);
1818#endif
1819 return(handlers[i]);
1820 }
1821
1822#ifdef LIBXML_ICONV_ENABLED
1823 /* check whether iconv can handle this */
1824 icv_in = iconv_open("UTF-8", name);
1825 icv_out = iconv_open(name, "UTF-8");
1826 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1827 enc = (xmlCharEncodingHandlerPtr)
1828 xmlMalloc(sizeof(xmlCharEncodingHandler));
1829 if (enc == NULL) {
1830 iconv_close(icv_in);
1831 iconv_close(icv_out);
1832 return(NULL);
1833 }
1834 enc->name = xmlMemStrdup(name);
1835 enc->input = NULL;
1836 enc->output = NULL;
1837 enc->iconv_in = icv_in;
1838 enc->iconv_out = icv_out;
1839#ifdef DEBUG_ENCODING
1840 xmlGenericError(xmlGenericErrorContext,
1841 "Found iconv handler for encoding %s\n", name);
1842#endif
1843 return enc;
1844 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1845 xmlGenericError(xmlGenericErrorContext,
1846 "iconv : problems with filters for '%s'\n", name);
1847 }
1848#endif /* LIBXML_ICONV_ENABLED */
1849
1850#ifdef DEBUG_ENCODING
1851 xmlGenericError(xmlGenericErrorContext,
1852 "No handler found for encoding %s\n", name);
1853#endif
1854
1855 /*
1856 * Fallback using the canonical names
1857 */
1858 alias = xmlParseCharEncoding(norig);
1859 if (alias != XML_CHAR_ENCODING_ERROR) {
1860 const char* canon;
1861 canon = xmlGetCharEncodingName(alias);
1862 if ((canon != NULL) && (strcmp(name, canon))) {
1863 return(xmlFindCharEncodingHandler(canon));
1864 }
1865 }
1866
1867 return(NULL);
1868}
1869
Daniel Veillard97ac1312001-05-30 19:14:17 +00001870/************************************************************************
1871 * *
1872 * ICONV based generic conversion functions *
1873 * *
1874 ************************************************************************/
1875
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert, in the source
 *       encoding of @cd
 * @inlen:  the length of @in
 *
 * Runs one iconv() conversion and translates its outcome to the
 * return codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (errno is E2BIG), or
 *     -2 if the transcoding fails, errno being EILSEQ (the input is not
 *        valid or the result can't be expressed in the target
 *        encoding), or
 *     -3 if the last bytes can't form a single output char (errno is
 *        EINVAL) and for any other failure.
 *
 * When @in is not NULL, the value of @inlen after return is the number
 * of octets consumed and the value of @outlen after return is the number
 * of octets produced. When @in is NULL both are set to 0.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    /* iconv() returns a size_t, failure being reported as (size_t) -1 */
    size_t ret;

    ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
    if (in != NULL) {
        /* iconv() left the remaining byte counts in the icv_ variables */
        *inlen -= icv_inlen;
        *outlen -= icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }
    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (errno == EILSEQ) {
            return -2;
        } else
#endif
#ifdef E2BIG
        if (errno == E2BIG) {
            return -1;
        } else
#endif
#ifdef EINVAL
        if (errno == EINVAL) {
            return -3;
        } else
#endif
        {
            return -3;
        }
    }
    return 0;
}
#endif /* LIBXML_ICONV_ENABLED */
1936
Daniel Veillard97ac1312001-05-30 19:14:17 +00001937/************************************************************************
1938 * *
1939 * The real API used by libxml for on-the-fly conversion *
1940 * *
1941 ************************************************************************/
1942
Owen Taylor3473f882001-02-23 17:55:21 +00001943/**
1944 * xmlCharEncFirstLine:
1945 * @handler: char enconding transformation data structure
1946 * @out: an xmlBuffer for the output.
1947 * @in: an xmlBuffer for the input
1948 *
1949 * Front-end for the encoding handler input function, but handle only
1950 * the very first line, i.e. limit itself to 45 chars.
1951 *
1952 * Returns the number of byte written if success, or
1953 * -1 general error
1954 * -2 if the transcoding fails (for *in is not valid utf8 string or
1955 * the result of transformation can't fit into the encoding we want), or
1956 */
1957int
1958xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1959 xmlBufferPtr in) {
1960 int ret = -2;
1961 int written;
1962 int toconv;
1963
1964 if (handler == NULL) return(-1);
1965 if (out == NULL) return(-1);
1966 if (in == NULL) return(-1);
1967
1968 written = out->size - out->use;
1969 toconv = in->use;
1970 if (toconv * 2 >= written) {
1971 xmlBufferGrow(out, toconv);
1972 written = out->size - out->use - 1;
1973 }
1974
1975 /*
1976 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1977 * 45 chars should be sufficient to reach the end of the encoding
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001978 * declaration without going too far inside the document content.
Owen Taylor3473f882001-02-23 17:55:21 +00001979 */
1980 written = 45;
1981
1982 if (handler->input != NULL) {
1983 ret = handler->input(&out->content[out->use], &written,
1984 in->content, &toconv);
1985 xmlBufferShrink(in, toconv);
1986 out->use += written;
1987 out->content[out->use] = 0;
1988 }
1989#ifdef LIBXML_ICONV_ENABLED
1990 else if (handler->iconv_in != NULL) {
1991 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1992 &written, in->content, &toconv);
1993 xmlBufferShrink(in, toconv);
1994 out->use += written;
1995 out->content[out->use] = 0;
1996 if (ret == -1) ret = -3;
1997 }
1998#endif /* LIBXML_ICONV_ENABLED */
1999#ifdef DEBUG_ENCODING
2000 switch (ret) {
2001 case 0:
2002 xmlGenericError(xmlGenericErrorContext,
2003 "converted %d bytes to %d bytes of input\n",
2004 toconv, written);
2005 break;
2006 case -1:
2007 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2008 toconv, written, in->use);
2009 break;
2010 case -2:
2011 xmlGenericError(xmlGenericErrorContext,
2012 "input conversion failed due to input error\n");
2013 break;
2014 case -3:
2015 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2016 toconv, written, in->use);
2017 break;
2018 default:
2019 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2020 }
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002021#endif /* DEBUG_ENCODING */
Owen Taylor3473f882001-02-23 17:55:21 +00002022 /*
2023 * Ignore when input buffer is not on a boundary
2024 */
2025 if (ret == -3) ret = 0;
2026 if (ret == -1) ret = 0;
2027 return(ret);
2028}
2029
2030/**
2031 * xmlCharEncInFunc:
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002032 * @handler: char encoding transformation data structure
Owen Taylor3473f882001-02-23 17:55:21 +00002033 * @out: an xmlBuffer for the output.
2034 * @in: an xmlBuffer for the input
2035 *
2036 * Generic front-end for the encoding handler input function
2037 *
2038 * Returns the number of byte written if success, or
2039 * -1 general error
2040 * -2 if the transcoding fails (for *in is not valid utf8 string or
2041 * the result of transformation can't fit into the encoding we want), or
2042 */
2043int
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002044xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2045 xmlBufferPtr in)
2046{
Owen Taylor3473f882001-02-23 17:55:21 +00002047 int ret = -2;
2048 int written;
2049 int toconv;
2050
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002051 if (handler == NULL)
2052 return (-1);
2053 if (out == NULL)
2054 return (-1);
2055 if (in == NULL)
2056 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00002057
2058 toconv = in->use;
2059 if (toconv == 0)
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002060 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00002061 written = out->size - out->use;
2062 if (toconv * 2 >= written) {
2063 xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002064 written = out->size - out->use - 1;
Owen Taylor3473f882001-02-23 17:55:21 +00002065 }
2066 if (handler->input != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002067 ret = handler->input(&out->content[out->use], &written,
2068 in->content, &toconv);
2069 xmlBufferShrink(in, toconv);
2070 out->use += written;
2071 out->content[out->use] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002072 }
2073#ifdef LIBXML_ICONV_ENABLED
2074 else if (handler->iconv_in != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002075 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2076 &written, in->content, &toconv);
2077 xmlBufferShrink(in, toconv);
2078 out->use += written;
2079 out->content[out->use] = 0;
2080 if (ret == -1)
2081 ret = -3;
Owen Taylor3473f882001-02-23 17:55:21 +00002082 }
2083#endif /* LIBXML_ICONV_ENABLED */
2084 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002085 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002086#ifdef DEBUG_ENCODING
2087 xmlGenericError(xmlGenericErrorContext,
2088 "converted %d bytes to %d bytes of input\n",
2089 toconv, written);
Owen Taylor3473f882001-02-23 17:55:21 +00002090#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002091 break;
2092 case -1:
2093#ifdef DEBUG_ENCODING
2094 xmlGenericError(xmlGenericErrorContext,
2095 "converted %d bytes to %d bytes of input, %d left\n",
2096 toconv, written, in->use);
2097#endif
2098 break;
2099 case -3:
2100#ifdef DEBUG_ENCODING
2101 xmlGenericError(xmlGenericErrorContext,
2102 "converted %d bytes to %d bytes of input, %d left\n",
2103 toconv, written, in->use);
2104#endif
2105 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002106 case -2:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002107 xmlGenericError(xmlGenericErrorContext,
2108 "input conversion failed due to input error\n");
2109 xmlGenericError(xmlGenericErrorContext,
2110 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2111 in->content[0], in->content[1],
2112 in->content[2], in->content[3]);
Owen Taylor3473f882001-02-23 17:55:21 +00002113 }
2114 /*
2115 * Ignore when input buffer is not on a boundary
2116 */
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002117 if (ret == -3)
2118 ret = 0;
2119 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00002120}
2121
2122/**
2123 * xmlCharEncOutFunc:
2124 * @handler: char enconding transformation data structure
2125 * @out: an xmlBuffer for the output.
2126 * @in: an xmlBuffer for the input
2127 *
2128 * Generic front-end for the encoding handler output function
2129 * a first call with @in == NULL has to be made firs to initiate the
2130 * output in case of non-stateless encoding needing to initiate their
2131 * state or the output (like the BOM in UTF16).
2132 * In case of UTF8 sequence conversion errors for the given encoder,
2133 * the content will be automatically remapped to a CharRef sequence.
2134 *
2135 * Returns the number of byte written if success, or
2136 * -1 general error
2137 * -2 if the transcoding fails (for *in is not valid utf8 string or
2138 * the result of transformation can't fit into the encoding we want), or
2139 */
2140int
2141xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2142 xmlBufferPtr in) {
2143 int ret = -2;
2144 int written;
2145 int writtentot = 0;
2146 int toconv;
2147 int output = 0;
2148
2149 if (handler == NULL) return(-1);
2150 if (out == NULL) return(-1);
2151
2152retry:
2153
2154 written = out->size - out->use;
2155
2156 /*
2157 * First specific handling of in = NULL, i.e. the initialization call
2158 */
2159 if (in == NULL) {
2160 toconv = 0;
2161 if (handler->output != NULL) {
2162 ret = handler->output(&out->content[out->use], &written,
2163 NULL, &toconv);
2164 out->use += written;
2165 out->content[out->use] = 0;
2166 }
2167#ifdef LIBXML_ICONV_ENABLED
2168 else if (handler->iconv_out != NULL) {
2169 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2170 &written, NULL, &toconv);
2171 out->use += written;
2172 out->content[out->use] = 0;
2173 }
2174#endif /* LIBXML_ICONV_ENABLED */
2175#ifdef DEBUG_ENCODING
2176 xmlGenericError(xmlGenericErrorContext,
2177 "initialized encoder\n");
2178#endif
2179 return(0);
2180 }
2181
2182 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002183 * Conversion itself.
Owen Taylor3473f882001-02-23 17:55:21 +00002184 */
2185 toconv = in->use;
2186 if (toconv == 0)
2187 return(0);
2188 if (toconv * 2 >= written) {
2189 xmlBufferGrow(out, toconv * 2);
2190 written = out->size - out->use - 1;
2191 }
2192 if (handler->output != NULL) {
2193 ret = handler->output(&out->content[out->use], &written,
2194 in->content, &toconv);
2195 xmlBufferShrink(in, toconv);
2196 out->use += written;
2197 writtentot += written;
2198 out->content[out->use] = 0;
2199 }
2200#ifdef LIBXML_ICONV_ENABLED
2201 else if (handler->iconv_out != NULL) {
2202 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2203 &written, in->content, &toconv);
2204 xmlBufferShrink(in, toconv);
2205 out->use += written;
2206 writtentot += written;
2207 out->content[out->use] = 0;
2208 if (ret == -1) {
2209 if (written > 0) {
2210 /*
2211 * Can be a limitation of iconv
2212 */
2213 goto retry;
2214 }
2215 ret = -3;
2216 }
2217 }
2218#endif /* LIBXML_ICONV_ENABLED */
2219 else {
2220 xmlGenericError(xmlGenericErrorContext,
2221 "xmlCharEncOutFunc: no output function !\n");
2222 return(-1);
2223 }
2224
2225 if (ret >= 0) output += ret;
2226
2227 /*
2228 * Attempt to handle error cases
2229 */
2230 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002231 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002232#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002233 xmlGenericError(xmlGenericErrorContext,
2234 "converted %d bytes to %d bytes of output\n",
2235 toconv, written);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002236#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002237 break;
2238 case -1:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002239#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002240 xmlGenericError(xmlGenericErrorContext,
2241 "output conversion failed by lack of space\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002242#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002243 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002244 case -3:
2245 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2246 toconv, written, in->use);
2247 break;
2248 case -2: {
2249 int len = in->use;
2250 const xmlChar *utf = (const xmlChar *) in->content;
2251 int cur;
2252
2253 cur = xmlGetUTF8Char(utf, &len);
2254 if (cur > 0) {
2255 xmlChar charref[20];
2256
2257#ifdef DEBUG_ENCODING
2258 xmlGenericError(xmlGenericErrorContext,
2259 "handling output conversion error\n");
2260 xmlGenericError(xmlGenericErrorContext,
2261 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2262 in->content[0], in->content[1],
2263 in->content[2], in->content[3]);
2264#endif
2265 /*
2266 * Removes the UTF8 sequence, and replace it by a charref
2267 * and continue the transcoding phase, hoping the error
2268 * did not mangle the encoder state.
2269 */
Daniel Veillard16698282001-09-14 10:29:27 +00002270 sprintf((char *) charref, "&#%d;", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002271 xmlBufferShrink(in, len);
2272 xmlBufferAddHead(in, charref, -1);
2273
2274 goto retry;
2275 } else {
2276 xmlGenericError(xmlGenericErrorContext,
2277 "output conversion failed due to conv error\n");
2278 xmlGenericError(xmlGenericErrorContext,
2279 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2280 in->content[0], in->content[1],
2281 in->content[2], in->content[3]);
2282 in->content[0] = ' ';
2283 }
2284 break;
2285 }
2286 }
2287 return(ret);
2288}
2289
2290/**
2291 * xmlCharEncCloseFunc:
2292 * @handler: char enconding transformation data structure
2293 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002294 * Generic front-end for encoding handler close function
Owen Taylor3473f882001-02-23 17:55:21 +00002295 *
2296 * Returns 0 if success, or -1 in case of error
2297 */
2298int
2299xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2300 int ret = 0;
2301 if (handler == NULL) return(-1);
2302 if (handler->name == NULL) return(-1);
2303#ifdef LIBXML_ICONV_ENABLED
2304 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002305 * Iconv handlers can be used only once, free the whole block.
Owen Taylor3473f882001-02-23 17:55:21 +00002306 * and the associated icon resources.
2307 */
2308 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2309 if (handler->name != NULL)
2310 xmlFree(handler->name);
2311 handler->name = NULL;
2312 if (handler->iconv_out != NULL) {
2313 if (iconv_close(handler->iconv_out))
2314 ret = -1;
2315 handler->iconv_out = NULL;
2316 }
2317 if (handler->iconv_in != NULL) {
2318 if (iconv_close(handler->iconv_in))
2319 ret = -1;
2320 handler->iconv_in = NULL;
2321 }
2322 xmlFree(handler);
2323 }
2324#endif /* LIBXML_ICONV_ENABLED */
2325#ifdef DEBUG_ENCODING
2326 if (ret)
2327 xmlGenericError(xmlGenericErrorContext,
2328 "failed to close the encoding handler\n");
2329 else
2330 xmlGenericError(xmlGenericErrorContext,
2331 "closed the encoding handler\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002332#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002333
Owen Taylor3473f882001-02-23 17:55:21 +00002334 return(ret);
2335}
2336