blob: de55e2e0f49cffed958c8a435916306e90c17230 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
15 *
Owen Taylor3473f882001-02-23 17:55:21 +000016 * See Copyright for the status of this software.
17 *
Daniel Veillardc5d64342001-06-24 12:13:24 +000018 * daniel@veillard.com
Daniel Veillard97ac1312001-05-30 19:14:17 +000019 *
20 * UTF8 string routines from:
21 * "William M. Brack" <wbrack@mmm.com.hk>
22 *
23 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Owen Taylor3473f882001-02-23 17:55:21 +000024 */
25
Daniel Veillard34ce8be2002-03-18 19:37:11 +000026#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000027#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000028
Owen Taylor3473f882001-02-23 17:55:21 +000029#include <string.h>
30
31#ifdef HAVE_CTYPE_H
32#include <ctype.h>
33#endif
34#ifdef HAVE_STDLIB_H
35#include <stdlib.h>
36#endif
Owen Taylor3473f882001-02-23 17:55:21 +000037#ifdef LIBXML_ICONV_ENABLED
38#ifdef HAVE_ERRNO_H
39#include <errno.h>
40#endif
41#endif
42#include <libxml/encoding.h>
43#include <libxml/xmlmemory.h>
44#ifdef LIBXML_HTML_ENABLED
45#include <libxml/HTMLparser.h>
46#endif
Daniel Veillard64a411c2001-10-15 12:32:07 +000047#include <libxml/globals.h>
Daniel Veillarda4617b82001-11-04 20:19:12 +000048#include <libxml/xmlerror.h>
Owen Taylor3473f882001-02-23 17:55:21 +000049
Daniel Veillard22090732001-07-16 00:06:07 +000050static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
51static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +000052
53typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
54typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
55struct _xmlCharEncodingAlias {
56 const char *name;
57 const char *alias;
58};
59
60static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
61static int xmlCharEncodingAliasesNb = 0;
62static int xmlCharEncodingAliasesMax = 0;
63
64#ifdef LIBXML_ICONV_ENABLED
65#if 0
66#define DEBUG_ENCODING /* Define this to get encoding traces */
67#endif
68#endif
69
70static int xmlLittleEndian = 1;
71
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon ! (The routines     *
 * below nevertheless accept the 4-byte form.)                          *
 *                                                                      *
 ************************************************************************/
Owen Taylor3473f882001-02-23 17:55:21 +000086
87/**
Daniel Veillarde043ee12001-04-16 14:08:07 +000088 * xmlUTF8Strlen:
89 * @utf: a sequence of UTF-8 encoded bytes
90 *
Daniel Veillard60087f32001-10-10 09:45:09 +000091 * compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillarde043ee12001-04-16 14:08:07 +000092 * checking of the content of the string.
93 *
94 * Returns the number of characters in the string or -1 in case of error
95 */
96int
Daniel Veillard97ac1312001-05-30 19:14:17 +000097xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillarde043ee12001-04-16 14:08:07 +000098 int ret = 0;
99
100 if (utf == NULL)
101 return(-1);
102
103 while (*utf != 0) {
104 if (utf[0] & 0x80) {
105 if ((utf[1] & 0xc0) != 0x80)
106 return(-1);
107 if ((utf[0] & 0xe0) == 0xe0) {
108 if ((utf[2] & 0xc0) != 0x80)
109 return(-1);
110 if ((utf[0] & 0xf0) == 0xf0) {
111 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
112 return(-1);
113 utf += 4;
114 } else {
115 utf += 3;
116 }
117 } else {
118 utf += 2;
119 }
120 } else {
121 utf++;
122 }
123 ret++;
124 }
125 return(ret);
126}
127
128/**
Owen Taylor3473f882001-02-23 17:55:21 +0000129 * xmlGetUTF8Char:
130 * @utf: a sequence of UTF-8 encoded bytes
131 * @len: a pointer to @bytes len
132 *
133 * Read one UTF8 Char from @utf
134 *
135 * Returns the char value or -1 in case of error and update @len with the
136 * number of bytes used
137 */
Daniel Veillardf000f072002-10-22 14:28:17 +0000138int
Owen Taylor3473f882001-02-23 17:55:21 +0000139xmlGetUTF8Char(const unsigned char *utf, int *len) {
140 unsigned int c;
141
142 if (utf == NULL)
143 goto error;
144 if (len == NULL)
145 goto error;
146 if (*len < 1)
147 goto error;
148
149 c = utf[0];
150 if (c & 0x80) {
151 if (*len < 2)
152 goto error;
153 if ((utf[1] & 0xc0) != 0x80)
154 goto error;
155 if ((c & 0xe0) == 0xe0) {
156 if (*len < 3)
157 goto error;
158 if ((utf[2] & 0xc0) != 0x80)
159 goto error;
160 if ((c & 0xf0) == 0xf0) {
161 if (*len < 4)
162 goto error;
163 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
164 goto error;
165 *len = 4;
166 /* 4-byte code */
167 c = (utf[0] & 0x7) << 18;
168 c |= (utf[1] & 0x3f) << 12;
169 c |= (utf[2] & 0x3f) << 6;
170 c |= utf[3] & 0x3f;
171 } else {
172 /* 3-byte code */
173 *len = 3;
174 c = (utf[0] & 0xf) << 12;
175 c |= (utf[1] & 0x3f) << 6;
176 c |= utf[2] & 0x3f;
177 }
178 } else {
179 /* 2-byte code */
180 *len = 2;
181 c = (utf[0] & 0x1f) << 6;
182 c |= utf[1] & 0x3f;
183 }
184 } else {
185 /* 1-byte code */
186 *len = 1;
187 }
188 return(c);
189
190error:
191 *len = 0;
192 return(-1);
193}
194
195/**
Daniel Veillard01c13b52002-12-10 15:19:08 +0000196 * xmlCheckUTF8:
Owen Taylor3473f882001-02-23 17:55:21 +0000197 * @utf: Pointer to putative utf-8 encoded string.
198 *
199 * Checks @utf for being valid utf-8. @utf is assumed to be
200 * null-terminated. This function is not super-strict, as it will
201 * allow longer utf-8 sequences than necessary. Note that Java is
202 * capable of producing these sequences if provoked. Also note, this
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000203 * routine checks for the 4-byte maximum size, but does not check for
Owen Taylor3473f882001-02-23 17:55:21 +0000204 * 0x10ffff maximum value.
205 *
206 * Return value: true if @utf is valid.
207 **/
208int
209xmlCheckUTF8(const unsigned char *utf)
210{
211 int ix;
212 unsigned char c;
213
214 for (ix = 0; (c = utf[ix]);) {
215 if (c & 0x80) {
216 if ((utf[ix + 1] & 0xc0) != 0x80)
217 return(0);
218 if ((c & 0xe0) == 0xe0) {
219 if ((utf[ix + 2] & 0xc0) != 0x80)
220 return(0);
221 if ((c & 0xf0) == 0xf0) {
222 if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
223 return(0);
224 ix += 4;
225 /* 4-byte code */
226 } else
227 /* 3-byte code */
228 ix += 3;
229 } else
230 /* 2-byte code */
231 ix += 2;
232 } else
233 /* 1-byte code */
234 ix++;
235 }
236 return(1);
237}
238
239/**
Daniel Veillard97ac1312001-05-30 19:14:17 +0000240 * xmlUTF8Strsize:
241 * @utf: a sequence of UTF-8 encoded bytes
242 * @len: the number of characters in the array
243 *
244 * storage size of an UTF8 string
245 *
246 * Returns the storage size of
247 * the first 'len' characters of ARRAY
248 *
249 */
250
251int
252xmlUTF8Strsize(const xmlChar *utf, int len) {
253 const xmlChar *ptr=utf;
254 xmlChar ch;
255
256 if (len <= 0)
257 return(0);
258
259 while ( len-- > 0) {
260 if ( !*ptr )
261 break;
262 if ( (ch = *ptr++) & 0x80)
263 while ( (ch<<=1) & 0x80 )
264 ptr++;
265 }
266 return (ptr - utf);
267}
268
269
270/**
271 * xmlUTF8Strndup:
272 * @utf: the input UTF8 *
273 * @len: the len of @utf (in chars)
274 *
275 * a strndup for array of UTF8's
276 *
277 * Returns a new UTF8 * or NULL
278 */
279xmlChar *
280xmlUTF8Strndup(const xmlChar *utf, int len) {
281 xmlChar *ret;
282 int i;
283
284 if ((utf == NULL) || (len < 0)) return(NULL);
285 i = xmlUTF8Strsize(utf, len);
286 ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
287 if (ret == NULL) {
288 xmlGenericError(xmlGenericErrorContext,
289 "malloc of %ld byte failed\n",
290 (len + 1) * (long)sizeof(xmlChar));
291 return(NULL);
292 }
293 memcpy(ret, utf, i * sizeof(xmlChar));
294 ret[i] = 0;
295 return(ret);
296}
297
298/**
299 * xmlUTF8Strpos:
300 * @utf: the input UTF8 *
301 * @pos: the position of the desired UTF8 char (in chars)
302 *
303 * a function to provide the equivalent of fetching a
304 * character from a string array
305 *
306 * Returns a pointer to the UTF8 character or NULL
307 */
308xmlChar *
309xmlUTF8Strpos(const xmlChar *utf, int pos) {
310 xmlChar ch;
311
312 if (utf == NULL) return(NULL);
313 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
314 return(NULL);
315 while (pos--) {
316 if ((ch=*utf++) == 0) return(NULL);
317 if ( ch & 0x80 ) {
318 /* if not simple ascii, verify proper format */
319 if ( (ch & 0xc0) != 0xc0 )
320 return(NULL);
321 /* then skip over remaining bytes for this char */
322 while ( (ch <<= 1) & 0x80 )
323 if ( (*utf++ & 0xc0) != 0x80 )
324 return(NULL);
325 }
326 }
327 return((xmlChar *)utf);
328}
329
330/**
331 * xmlUTF8Strloc:
332 * @utf: the input UTF8 *
333 * @utfchar: the UTF8 character to be found
334 *
335 * a function to provide relative location of a UTF8 char
336 *
337 * Returns the relative character position of the desired char
338 * or -1 if not found
339 */
340int
341xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
342 int i, size;
343 xmlChar ch;
344
345 if (utf==NULL || utfchar==NULL) return -1;
346 size = xmlUTF8Strsize(utfchar, 1);
347 for(i=0; (ch=*utf) != 0; i++) {
348 if (xmlStrncmp(utf, utfchar, size)==0)
349 return(i);
350 utf++;
351 if ( ch & 0x80 ) {
352 /* if not simple ascii, verify proper format */
353 if ( (ch & 0xc0) != 0xc0 )
354 return(-1);
355 /* then skip over remaining bytes for this char */
356 while ( (ch <<= 1) & 0x80 )
357 if ( (*utf++ & 0xc0) != 0x80 )
358 return(-1);
359 }
360 }
361
362 return(-1);
363}
364/**
365 * xmlUTF8Strsub:
366 * @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard97ac1312001-05-30 19:14:17 +0000367 * @start: relative pos of first char
368 * @len: total number to copy
369 *
370 * Note: positions are given in units of UTF-8 chars
371 *
372 * Returns a pointer to a newly created string
373 * or NULL if any problem
374 */
375
376xmlChar *
377xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
378 int i;
379 xmlChar ch;
380
381 if (utf == NULL) return(NULL);
382 if (start < 0) return(NULL);
383 if (len < 0) return(NULL);
384
385 /*
386 * Skip over any leading chars
387 */
388 for (i = 0;i < start;i++) {
389 if ((ch=*utf++) == 0) return(NULL);
390 if ( ch & 0x80 ) {
391 /* if not simple ascii, verify proper format */
392 if ( (ch & 0xc0) != 0xc0 )
393 return(NULL);
394 /* then skip over remaining bytes for this char */
395 while ( (ch <<= 1) & 0x80 )
396 if ( (*utf++ & 0xc0) != 0x80 )
397 return(NULL);
398 }
399 }
400
401 return(xmlUTF8Strndup(utf, len));
402}
403
404/************************************************************************
405 * *
406 * Conversions To/From UTF8 encoding *
407 * *
408 ************************************************************************/
409
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. Every ASCII char is its own UTF-8 encoding, so
 * the conversion is a checked byte copy that uses the whole of @out.
 *
 * Returns 0 if success, or -1 if a byte above 0x7F is met.
 * In both cases the value of @inlen after return is the number of
 * octets consumed and the value of @outlen after return is the number
 * of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend;
    const unsigned char* inend;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    outend = out + (*outlen);
    inend = in + (*inlen);
    while ((in < inend) && (out < outend)) {
        if (*in >= 0x80) {
            /* not an ASCII char, stop in front of it */
            ret = -1;
            break;
        }
        *out++ = *in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(ret);
}
460
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. A NULL @in is an initialization call and does
 * nothing. Conversion stops without error when @out is full or when
 * @in ends in the middle of a multi-byte sequence.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a character that does not exist in ASCII).
 * In all cases the value of @inlen after return is the number of octets
 * consumed (whole characters only) and the value of @outlen after
 * return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            goto done;
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else {
            /* no chance for this in Ascii */
            ret = -2;
            goto done;
        }

        /* sequence cut by the end of the block, wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte: malformed sequence */
                ret = -2;
                goto done;
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c >= 0x80) {
            /* no chance for this in Ascii */
            ret = -2;
            goto done;
        }
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
done:
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(ret);
}
543
544/**
545 * isolat1ToUTF8:
546 * @out: a pointer to an array of bytes to store the result
547 * @outlen: the length of @out
548 * @in: a pointer to an array of ISO Latin 1 chars
549 * @inlen: the length of @in
550 *
551 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
552 * block of chars out.
553 * Returns 0 if success, or -1 otherwise
554 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000555 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +0000556 * The value of @outlen after return is the number of ocetes consumed.
557 */
558int
559isolat1ToUTF8(unsigned char* out, int *outlen,
560 const unsigned char* in, int *inlen) {
561 unsigned char* outstart = out;
562 const unsigned char* base = in;
Owen Taylor3473f882001-02-23 17:55:21 +0000563 unsigned char* outend = out + *outlen;
564 const unsigned char* inend;
Daniel Veillarde72c7562002-05-31 09:47:30 +0000565 const unsigned char* instop;
566 xmlChar c = *in;
Owen Taylor3473f882001-02-23 17:55:21 +0000567
568 inend = in + (*inlen);
Daniel Veillarde72c7562002-05-31 09:47:30 +0000569 instop = inend;
570
571 while (in < inend && out < outend - 1) {
572 if (c >= 0x80) {
Daniel Veillarddb552912002-03-21 13:27:59 +0000573 *out++= ((c >> 6) & 0x1F) | 0xC0;
Daniel Veillard02141ea2001-04-30 11:46:40 +0000574 *out++= (c & 0x3F) | 0x80;
Daniel Veillarde72c7562002-05-31 09:47:30 +0000575 ++in;
576 c = *in;
577 }
578 if (instop - in > outend - out) instop = in + (outend - out);
579 while (c < 0x80 && in < instop) {
580 *out++ = c;
581 ++in;
582 c = *in;
583 }
584 }
585 if (in < inend && out < outend && c < 0x80) {
586 *out++ = c;
587 ++in;
Owen Taylor3473f882001-02-23 17:55:21 +0000588 }
589 *outlen = out - outstart;
Daniel Veillarde72c7562002-05-31 09:47:30 +0000590 *inlen = in - base;
Owen Taylor3473f882001-02-23 17:55:21 +0000591 return(0);
592}
593
/**
 * UTF8ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-8 chars
 * @inlenb:  the length of @inb in bytes
 *
 * No op copy operation for UTF8 handling: copies as many bytes as both
 * buffers allow, without looking at character boundaries.
 *
 * Returns 0 on success, or -1 if an argument is NULL or the number of
 * bytes to copy is negative.
 * On success *@outlen and *@inlenb are both set to the number of
 * bytes copied.
 */
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
           const unsigned char* inb, int *inlenb)
{
    int count;

    if (!out || !inb || !outlen || !inlenb)
        return(-1);

    /* the smaller of the two buffer sizes */
    count = (*inlenb < *outlen) ? *inlenb : *outlen;
    if (count < 0)
        return(-1);

    memcpy(out, inb, count);

    *inlenb = count;
    *outlen = count;
    return(0);
}
630
Daniel Veillarde72c7562002-05-31 09:47:30 +0000631
Owen Taylor3473f882001-02-23 17:55:21 +0000632/**
633 * UTF8Toisolat1:
634 * @out: a pointer to an array of bytes to store the result
635 * @outlen: the length of @out
636 * @in: a pointer to an array of UTF-8 chars
637 * @inlen: the length of @in
638 *
639 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
640 * block of chars out.
641 *
642 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
643 * The value of @inlen after return is the number of octets consumed
Daniel Veillardcbaf3992001-12-31 16:16:02 +0000644 * as the return value is positive, else unpredictable.
Owen Taylor3473f882001-02-23 17:55:21 +0000645 * The value of @outlen after return is the number of ocetes consumed.
646 */
647int
648UTF8Toisolat1(unsigned char* out, int *outlen,
649 const unsigned char* in, int *inlen) {
650 const unsigned char* processed = in;
651 const unsigned char* outend;
652 const unsigned char* outstart = out;
653 const unsigned char* instart = in;
654 const unsigned char* inend;
655 unsigned int c, d;
656 int trailing;
657
658 if (in == NULL) {
659 /*
660 * initialization nothing to do
661 */
662 *outlen = 0;
663 *inlen = 0;
664 return(0);
665 }
666 inend = in + (*inlen);
667 outend = out + (*outlen);
668 while (in < inend) {
669 d = *in++;
670 if (d < 0x80) { c= d; trailing= 0; }
671 else if (d < 0xC0) {
672 /* trailing byte in leading position */
673 *outlen = out - outstart;
674 *inlen = processed - instart;
675 return(-2);
676 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
677 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
678 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
679 else {
680 /* no chance for this in IsoLat1 */
681 *outlen = out - outstart;
682 *inlen = processed - instart;
683 return(-2);
684 }
685
686 if (inend - in < trailing) {
687 break;
688 }
689
690 for ( ; trailing; trailing--) {
691 if (in >= inend)
692 break;
693 if (((d= *in++) & 0xC0) != 0x80) {
694 *outlen = out - outstart;
695 *inlen = processed - instart;
696 return(-2);
697 }
698 c <<= 6;
699 c |= d & 0x3F;
700 }
701
702 /* assertion: c is a single UTF-4 value */
703 if (c <= 0xFF) {
704 if (out >= outend)
705 break;
706 *out++ = c;
707 } else {
708 /* no chance for this in IsoLat1 */
709 *outlen = out - outstart;
710 *inlen = processed - instart;
711 return(-2);
712 }
713 processed = in;
714 }
715 *outlen = out - outstart;
716 *inlen = processed - instart;
717 return(0);
718}
719
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read byte by byte, so the
 * result depends neither on the alignment of @inb nor on the byte order
 * of the machine. An odd trailing byte is ignored. Conversion stops
 * without error when the next character does not fit in @out or when
 * @inb ends between the two halves of a surrogate pair.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate that is not followed by a low surrogate).
 * In both cases the value of *@inlenb after return is the number of
 * octets consumed (whole characters only) and the value of *@outlen
 * is the number of octets produced.
 * NOTE(review): a low surrogate met on its own is passed through as a
 * 3-byte sequence, as before -- confirm whether it should be refused.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d;
    int need, bits;
    int ret = 0;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        /* low byte first */
        c = in[0] | ((unsigned int) in[1] << 8);
        next = in + 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (next >= inend)
                break;                   /* second half not there yet */
            d = next[0] | ((unsigned int) next[1] << 8);
            if ((d & 0xFC00) != 0xDC00) {
                ret = -2;
                break;
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
            next += 2;
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80) need = 1;
        else if (c <   0x800) need = 2;
        else if (c < 0x10000) need = 3;
        else                  need = 4;
        /* all or nothing: never leave half a character in @out */
        if (outend - out < need)
            break;

        if      (need == 1) {  *out++=  c;                        bits= -6; }
        else if (need == 2) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (need == 3) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;

        in = next;
    }
    *outlen = out - outstart;
    *inlenb = in - inb;
    return(ret);
}
807
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written byte by byte, so the result
 * depends neither on the alignment of @outb nor on the byte order of
 * the machine.
 *
 * A NULL @in is an initialization call: when @outb has room for it the
 * Byte Order Mark FF FE is written and 2 is returned, otherwise nothing
 * is written and 0 is returned.
 *
 * Conversion stops without error when the next character does not fit
 * in @outb or when @in ends in the middle of a multi-byte sequence.
 *
 * Returns 0 if success, or -2 if the transcoding failed (malformed
 * UTF-8 or a value above 0x10FFFF).
 * In both cases the value of *@inlen after return is the number of
 * octets consumed (whole characters only) and the value of *@outlen is
 * the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units are ever written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            goto done;
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            goto done;
        }

        /* sequence cut by the end of the block, wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte: malformed sequence */
                ret = -2;
                goto done;
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = c & 0xFF;
            out[1] = (c >> 8) & 0xFF;
            out += 2;
        }
        else if (c < 0x110000) {
            /* surrogate pair, needs two units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            d = 0xD800 | (c >> 10);
            out[0] = d & 0xFF;
            out[1] = (d >> 8) & 0xFF;
            d = 0xDC00 | (c & 0x03FF);
            out[2] = d & 0xFF;
            out[3] = (d >> 8) & 0xFF;
            out += 4;
        }
        else {
            /* beyond the range UTF-16 can express */
            ret = -2;
            goto done;
        }
        processed = in;
    }
done:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(ret);
}
926
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read byte by byte, so the
 * result depends neither on the alignment of @inb nor on the byte order
 * of the machine. An odd trailing byte is ignored. Conversion stops
 * without error when the next character does not fit in @out or when
 * @inb ends between the two halves of a surrogate pair (the same
 * behaviour as the UTF-16LE converter).
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 * surrogate that is not followed by a low surrogate).
 * In both cases the value of *@inlenb after return is the number of
 * octets consumed (whole characters only) and the value of *@outlen
 * is the number of octets produced.
 * NOTE(review): a low surrogate met on its own is passed through as a
 * 3-byte sequence, as before -- confirm whether it should be refused.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d;
    int need, bits;
    int ret = 0;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        /* high byte first */
        c = ((unsigned int) in[0] << 8) | in[1];
        next = in + 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (next >= inend)
                break;                   /* second half not there yet */
            d = ((unsigned int) next[0] << 8) | next[1];
            if ((d & 0xFC00) != 0xDC00) {
                ret = -2;
                break;
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
            next += 2;
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80) need = 1;
        else if (c <   0x800) need = 2;
        else if (c < 0x10000) need = 3;
        else                  need = 4;
        /* all or nothing: never leave half a character in @out */
        if (outend - out < need)
            break;

        if      (need == 1) {  *out++=  c;                        bits= -6; }
        else if (need == 2) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (need == 3) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;

        in = next;
    }
    *outlen = out - outstart;
    *inlenb = in - inb;
    return(ret);
}
1018
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written byte by byte, so the result
 * depends neither on the alignment of @outb nor on the byte order of
 * the machine.
 *
 * A NULL @in is an initialization call: when @outb has room for it the
 * Byte Order Mark FE FF is written and 2 is returned, otherwise nothing
 * is written and 0 is returned.
 *
 * Conversion stops without error when the next character does not fit
 * in @outb or when @in ends in the middle of a multi-byte sequence.
 *
 * Returns 0 if success, or -2 if the transcoding failed (malformed
 * UTF-8 or a value above 0x10FFFF).
 * In both cases the value of *@inlen after return is the number of
 * octets consumed (whole characters only) and the value of *@outlen is
 * the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units are ever written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  {
            /* trailing byte in leading position */
            ret = -2;
            goto done;
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            goto done;
        }

        /* sequence cut by the end of the block, wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte: malformed sequence */
                ret = -2;
                goto done;
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (c >> 8) & 0xFF;
            out[1] = c & 0xFF;
            out += 2;
        }
        else if (c < 0x110000) {
            /* surrogate pair, needs two units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            d = 0xD800 | (c >> 10);
            out[0] = (d >> 8) & 0xFF;
            out[1] = d & 0xFF;
            d = 0xDC00 | (c & 0x03FF);
            out[2] = (d >> 8) & 0xFF;
            out[3] = d & 0xFF;
            out += 4;
        }
        else {
            /* beyond the range UTF-16 can express */
            ret = -2;
            goto done;
        }
        processed = in;
    }
done:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(ret);
}
1134
Daniel Veillard97ac1312001-05-30 19:14:17 +00001135/************************************************************************
1136 * *
1137 * Generic encoding handling routines *
1138 * *
1139 ************************************************************************/
1140
Owen Taylor3473f882001-02-23 17:55:21 +00001141/**
1142 * xmlDetectCharEncoding:
1143 * @in: a pointer to the first bytes of the XML entity, must be at least
1144 * 4 bytes long.
1145 * @len: pointer to the length of the buffer
1146 *
1147 * Guess the encoding of the entity using the first bytes of the entity content
1148 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1149 *
1150 * Returns one of the XML_CHAR_ENCODING_... values.
1151 */
1152xmlCharEncoding
1153xmlDetectCharEncoding(const unsigned char* in, int len)
1154{
1155 if (len >= 4) {
1156 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1157 (in[2] == 0x00) && (in[3] == 0x3C))
1158 return(XML_CHAR_ENCODING_UCS4BE);
1159 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1160 (in[2] == 0x00) && (in[3] == 0x00))
1161 return(XML_CHAR_ENCODING_UCS4LE);
1162 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1163 (in[2] == 0x3C) && (in[3] == 0x00))
1164 return(XML_CHAR_ENCODING_UCS4_2143);
1165 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1166 (in[2] == 0x00) && (in[3] == 0x00))
1167 return(XML_CHAR_ENCODING_UCS4_3412);
1168 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1169 (in[2] == 0xA7) && (in[3] == 0x94))
1170 return(XML_CHAR_ENCODING_EBCDIC);
1171 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1172 (in[2] == 0x78) && (in[3] == 0x6D))
1173 return(XML_CHAR_ENCODING_UTF8);
1174 }
Daniel Veillard87a764e2001-06-20 17:41:10 +00001175 if (len >= 3) {
1176 /*
1177 * Errata on XML-1.0 June 20 2001
1178 * We now allow an UTF8 encoded BOM
1179 */
1180 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1181 (in[2] == 0xBF))
1182 return(XML_CHAR_ENCODING_UTF8);
1183 }
Owen Taylor3473f882001-02-23 17:55:21 +00001184 if (len >= 2) {
1185 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1186 return(XML_CHAR_ENCODING_UTF16BE);
1187 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1188 return(XML_CHAR_ENCODING_UTF16LE);
1189 }
1190 return(XML_CHAR_ENCODING_NONE);
1191}
1192
1193/**
1194 * xmlCleanupEncodingAliases:
1195 *
1196 * Unregisters all aliases
1197 */
1198void
1199xmlCleanupEncodingAliases(void) {
1200 int i;
1201
1202 if (xmlCharEncodingAliases == NULL)
1203 return;
1204
1205 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1206 if (xmlCharEncodingAliases[i].name != NULL)
1207 xmlFree((char *) xmlCharEncodingAliases[i].name);
1208 if (xmlCharEncodingAliases[i].alias != NULL)
1209 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1210 }
1211 xmlCharEncodingAliasesNb = 0;
1212 xmlCharEncodingAliasesMax = 0;
1213 xmlFree(xmlCharEncodingAliases);
Daniel Veillard73c6e532002-01-08 13:15:33 +00001214 xmlCharEncodingAliases = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001215}
1216
1217/**
1218 * xmlGetEncodingAlias:
1219 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1220 *
1221 * Lookup an encoding name for the given alias.
1222 *
1223 * Returns NULL if not found the original name otherwise
1224 */
1225const char *
1226xmlGetEncodingAlias(const char *alias) {
1227 int i;
1228 char upper[100];
1229
1230 if (alias == NULL)
1231 return(NULL);
1232
1233 if (xmlCharEncodingAliases == NULL)
1234 return(NULL);
1235
1236 for (i = 0;i < 99;i++) {
1237 upper[i] = toupper(alias[i]);
1238 if (upper[i] == 0) break;
1239 }
1240 upper[i] = 0;
1241
1242 /*
1243 * Walk down the list looking for a definition of the alias
1244 */
1245 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1246 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1247 return(xmlCharEncodingAliases[i].name);
1248 }
1249 }
1250 return(NULL);
1251}
1252
1253/**
1254 * xmlAddEncodingAlias:
1255 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1256 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1257 *
1258 * Registers and alias @alias for an encoding named @name. Existing alias
1259 * will be overwritten.
1260 *
1261 * Returns 0 in case of success, -1 in case of error
1262 */
1263int
1264xmlAddEncodingAlias(const char *name, const char *alias) {
1265 int i;
1266 char upper[100];
1267
1268 if ((name == NULL) || (alias == NULL))
1269 return(-1);
1270
1271 for (i = 0;i < 99;i++) {
1272 upper[i] = toupper(alias[i]);
1273 if (upper[i] == 0) break;
1274 }
1275 upper[i] = 0;
1276
1277 if (xmlCharEncodingAliases == NULL) {
1278 xmlCharEncodingAliasesNb = 0;
1279 xmlCharEncodingAliasesMax = 20;
1280 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1281 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1282 if (xmlCharEncodingAliases == NULL)
1283 return(-1);
1284 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1285 xmlCharEncodingAliasesMax *= 2;
1286 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1287 xmlRealloc(xmlCharEncodingAliases,
1288 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1289 }
1290 /*
1291 * Walk down the list looking for a definition of the alias
1292 */
1293 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1294 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1295 /*
1296 * Replace the definition.
1297 */
1298 xmlFree((char *) xmlCharEncodingAliases[i].name);
1299 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1300 return(0);
1301 }
1302 }
1303 /*
1304 * Add the definition
1305 */
1306 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1307 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1308 xmlCharEncodingAliasesNb++;
1309 return(0);
1310}
1311
1312/**
1313 * xmlDelEncodingAlias:
1314 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1315 *
1316 * Unregisters an encoding alias @alias
1317 *
1318 * Returns 0 in case of success, -1 in case of error
1319 */
1320int
1321xmlDelEncodingAlias(const char *alias) {
1322 int i;
1323
1324 if (alias == NULL)
1325 return(-1);
1326
1327 if (xmlCharEncodingAliases == NULL)
1328 return(-1);
1329 /*
1330 * Walk down the list looking for a definition of the alias
1331 */
1332 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1333 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1334 xmlFree((char *) xmlCharEncodingAliases[i].name);
1335 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1336 xmlCharEncodingAliasesNb--;
1337 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1338 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1339 return(0);
1340 }
1341 }
1342 return(-1);
1343}
1344
1345/**
1346 * xmlParseCharEncoding:
1347 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1348 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001349 * Compare the string to the known encoding schemes already known. Note
Owen Taylor3473f882001-02-23 17:55:21 +00001350 * that the comparison is case insensitive accordingly to the section
1351 * [XML] 4.3.3 Character Encoding in Entities.
1352 *
1353 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1354 * if not recognized.
1355 */
1356xmlCharEncoding
1357xmlParseCharEncoding(const char* name)
1358{
1359 const char *alias;
1360 char upper[500];
1361 int i;
1362
1363 if (name == NULL)
1364 return(XML_CHAR_ENCODING_NONE);
1365
1366 /*
1367 * Do the alias resolution
1368 */
1369 alias = xmlGetEncodingAlias(name);
1370 if (alias != NULL)
1371 name = alias;
1372
1373 for (i = 0;i < 499;i++) {
1374 upper[i] = toupper(name[i]);
1375 if (upper[i] == 0) break;
1376 }
1377 upper[i] = 0;
1378
1379 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1380 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1381 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1382
1383 /*
1384 * NOTE: if we were able to parse this, the endianness of UTF16 is
1385 * already found and in use
1386 */
1387 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1388 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1389
1390 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1391 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1392 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1393
1394 /*
1395 * NOTE: if we were able to parse this, the endianness of UCS4 is
1396 * already found and in use
1397 */
1398 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1399 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1400 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1401
1402
1403 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1404 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1405 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1406
1407 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1408 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1409 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1410
1411 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1412 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1413 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1414 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1415 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1416 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1417 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1418
1419 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1420 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1421 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1422
1423#ifdef DEBUG_ENCODING
1424 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1425#endif
1426 return(XML_CHAR_ENCODING_ERROR);
1427}
1428
1429/**
1430 * xmlGetCharEncodingName:
1431 * @enc: the encoding
1432 *
1433 * The "canonical" name for XML encoding.
1434 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1435 * Section 4.3.3 Character Encoding in Entities
1436 *
1437 * Returns the canonical name for the given encoding
1438 */
1439
1440const char*
1441xmlGetCharEncodingName(xmlCharEncoding enc) {
1442 switch (enc) {
1443 case XML_CHAR_ENCODING_ERROR:
1444 return(NULL);
1445 case XML_CHAR_ENCODING_NONE:
1446 return(NULL);
1447 case XML_CHAR_ENCODING_UTF8:
1448 return("UTF-8");
1449 case XML_CHAR_ENCODING_UTF16LE:
1450 return("UTF-16");
1451 case XML_CHAR_ENCODING_UTF16BE:
1452 return("UTF-16");
1453 case XML_CHAR_ENCODING_EBCDIC:
1454 return("EBCDIC");
1455 case XML_CHAR_ENCODING_UCS4LE:
1456 return("ISO-10646-UCS-4");
1457 case XML_CHAR_ENCODING_UCS4BE:
1458 return("ISO-10646-UCS-4");
1459 case XML_CHAR_ENCODING_UCS4_2143:
1460 return("ISO-10646-UCS-4");
1461 case XML_CHAR_ENCODING_UCS4_3412:
1462 return("ISO-10646-UCS-4");
1463 case XML_CHAR_ENCODING_UCS2:
1464 return("ISO-10646-UCS-2");
1465 case XML_CHAR_ENCODING_8859_1:
1466 return("ISO-8859-1");
1467 case XML_CHAR_ENCODING_8859_2:
1468 return("ISO-8859-2");
1469 case XML_CHAR_ENCODING_8859_3:
1470 return("ISO-8859-3");
1471 case XML_CHAR_ENCODING_8859_4:
1472 return("ISO-8859-4");
1473 case XML_CHAR_ENCODING_8859_5:
1474 return("ISO-8859-5");
1475 case XML_CHAR_ENCODING_8859_6:
1476 return("ISO-8859-6");
1477 case XML_CHAR_ENCODING_8859_7:
1478 return("ISO-8859-7");
1479 case XML_CHAR_ENCODING_8859_8:
1480 return("ISO-8859-8");
1481 case XML_CHAR_ENCODING_8859_9:
1482 return("ISO-8859-9");
1483 case XML_CHAR_ENCODING_2022_JP:
1484 return("ISO-2022-JP");
1485 case XML_CHAR_ENCODING_SHIFT_JIS:
1486 return("Shift-JIS");
1487 case XML_CHAR_ENCODING_EUC_JP:
1488 return("EUC-JP");
1489 case XML_CHAR_ENCODING_ASCII:
1490 return(NULL);
1491 }
1492 return(NULL);
1493}
1494
Daniel Veillard97ac1312001-05-30 19:14:17 +00001495/************************************************************************
1496 * *
1497 * Char encoding handlers *
1498 * *
1499 ************************************************************************/
1500
Owen Taylor3473f882001-02-23 17:55:21 +00001501
/* the size should be growable, but it's not a big deal ... */
#define MAX_ENCODING_HANDLERS 50
/*
 * Table of the registered handlers: allocated with MAX_ENCODING_HANDLERS
 * slots by xmlInitCharEncodingHandlers(), filled by
 * xmlRegisterCharEncodingHandler() and released by
 * xmlCleanupCharEncodingHandlers().
 */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of slots of the handlers table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * This is the value xmlFindCharEncodingHandler() returns for a NULL or
 * empty encoding name.
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1513
1514/**
1515 * xmlNewCharEncodingHandler:
1516 * @name: the encoding name, in UTF-8 format (ASCII actually)
1517 * @input: the xmlCharEncodingInputFunc to read that encoding
1518 * @output: the xmlCharEncodingOutputFunc to write that encoding
1519 *
1520 * Create and registers an xmlCharEncodingHandler.
Daniel Veillard6f46f6c2002-08-01 12:22:24 +00001521 *
Owen Taylor3473f882001-02-23 17:55:21 +00001522 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1523 */
Daniel Veillard6f46f6c2002-08-01 12:22:24 +00001524xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001525xmlNewCharEncodingHandler(const char *name,
1526 xmlCharEncodingInputFunc input,
1527 xmlCharEncodingOutputFunc output) {
1528 xmlCharEncodingHandlerPtr handler;
1529 const char *alias;
1530 char upper[500];
1531 int i;
1532 char *up = 0;
1533
1534 /*
1535 * Do the alias resolution
1536 */
1537 alias = xmlGetEncodingAlias(name);
1538 if (alias != NULL)
1539 name = alias;
1540
1541 /*
1542 * Keep only the uppercase version of the encoding.
1543 */
1544 if (name == NULL) {
1545 xmlGenericError(xmlGenericErrorContext,
1546 "xmlNewCharEncodingHandler : no name !\n");
1547 return(NULL);
1548 }
1549 for (i = 0;i < 499;i++) {
1550 upper[i] = toupper(name[i]);
1551 if (upper[i] == 0) break;
1552 }
1553 upper[i] = 0;
1554 up = xmlMemStrdup(upper);
1555 if (up == NULL) {
1556 xmlGenericError(xmlGenericErrorContext,
1557 "xmlNewCharEncodingHandler : out of memory !\n");
1558 return(NULL);
1559 }
1560
1561 /*
1562 * allocate and fill-up an handler block.
1563 */
1564 handler = (xmlCharEncodingHandlerPtr)
1565 xmlMalloc(sizeof(xmlCharEncodingHandler));
1566 if (handler == NULL) {
1567 xmlGenericError(xmlGenericErrorContext,
1568 "xmlNewCharEncodingHandler : out of memory !\n");
1569 return(NULL);
1570 }
1571 handler->input = input;
1572 handler->output = output;
1573 handler->name = up;
1574
1575#ifdef LIBXML_ICONV_ENABLED
1576 handler->iconv_in = NULL;
1577 handler->iconv_out = NULL;
1578#endif /* LIBXML_ICONV_ENABLED */
1579
1580 /*
1581 * registers and returns the handler.
1582 */
1583 xmlRegisterCharEncodingHandler(handler);
1584#ifdef DEBUG_ENCODING
1585 xmlGenericError(xmlGenericErrorContext,
1586 "Registered encoding handler for %s\n", name);
1587#endif
1588 return(handler);
1589}
1590
1591/**
1592 * xmlInitCharEncodingHandlers:
1593 *
1594 * Initialize the char encoding support, it registers the default
1595 * encoding supported.
1596 * NOTE: while public, this function usually doesn't need to be called
1597 * in normal processing.
1598 */
1599void
1600xmlInitCharEncodingHandlers(void) {
1601 unsigned short int tst = 0x1234;
1602 unsigned char *ptr = (unsigned char *) &tst;
1603
1604 if (handlers != NULL) return;
1605
1606 handlers = (xmlCharEncodingHandlerPtr *)
1607 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1608
1609 if (*ptr == 0x12) xmlLittleEndian = 0;
1610 else if (*ptr == 0x34) xmlLittleEndian = 1;
1611 else xmlGenericError(xmlGenericErrorContext,
1612 "Odd problem at endianness detection\n");
1613
1614 if (handlers == NULL) {
1615 xmlGenericError(xmlGenericErrorContext,
1616 "xmlInitCharEncodingHandlers : out of memory !\n");
1617 return;
1618 }
Daniel Veillard81601f92003-01-14 13:42:37 +00001619 xmlNewCharEncodingHandler("UTF-8", UTF8ToUTF8, UTF8ToUTF8);
Owen Taylor3473f882001-02-23 17:55:21 +00001620 xmlUTF16LEHandler =
1621 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1622 xmlUTF16BEHandler =
1623 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1624 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1625 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard20042422001-05-31 18:22:04 +00001626 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor3473f882001-02-23 17:55:21 +00001627#ifdef LIBXML_HTML_ENABLED
1628 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1629#endif
1630}
1631
1632/**
1633 * xmlCleanupCharEncodingHandlers:
1634 *
1635 * Cleanup the memory allocated for the char encoding support, it
1636 * unregisters all the encoding handlers and the aliases.
1637 */
1638void
1639xmlCleanupCharEncodingHandlers(void) {
1640 xmlCleanupEncodingAliases();
1641
1642 if (handlers == NULL) return;
1643
1644 for (;nbCharEncodingHandler > 0;) {
1645 nbCharEncodingHandler--;
1646 if (handlers[nbCharEncodingHandler] != NULL) {
1647 if (handlers[nbCharEncodingHandler]->name != NULL)
1648 xmlFree(handlers[nbCharEncodingHandler]->name);
1649 xmlFree(handlers[nbCharEncodingHandler]);
1650 }
1651 }
1652 xmlFree(handlers);
1653 handlers = NULL;
1654 nbCharEncodingHandler = 0;
1655 xmlDefaultCharEncodingHandler = NULL;
1656}
1657
1658/**
1659 * xmlRegisterCharEncodingHandler:
1660 * @handler: the xmlCharEncodingHandlerPtr handler block
1661 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001662 * Register the char encoding handler, surprising, isn't it ?
Owen Taylor3473f882001-02-23 17:55:21 +00001663 */
1664void
1665xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1666 if (handlers == NULL) xmlInitCharEncodingHandlers();
1667 if (handler == NULL) {
1668 xmlGenericError(xmlGenericErrorContext,
1669 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1670 return;
1671 }
1672
1673 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1674 xmlGenericError(xmlGenericErrorContext,
1675 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1676 xmlGenericError(xmlGenericErrorContext,
1677 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1678 return;
1679 }
1680 handlers[nbCharEncodingHandler++] = handler;
1681}
1682
1683/**
1684 * xmlGetCharEncodingHandler:
1685 * @enc: an xmlCharEncoding value.
1686 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001687 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001688 *
1689 * Returns the handler or NULL if not found
1690 */
1691xmlCharEncodingHandlerPtr
1692xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1693 xmlCharEncodingHandlerPtr handler;
1694
1695 if (handlers == NULL) xmlInitCharEncodingHandlers();
1696 switch (enc) {
1697 case XML_CHAR_ENCODING_ERROR:
1698 return(NULL);
1699 case XML_CHAR_ENCODING_NONE:
1700 return(NULL);
1701 case XML_CHAR_ENCODING_UTF8:
1702 return(NULL);
1703 case XML_CHAR_ENCODING_UTF16LE:
1704 return(xmlUTF16LEHandler);
1705 case XML_CHAR_ENCODING_UTF16BE:
1706 return(xmlUTF16BEHandler);
1707 case XML_CHAR_ENCODING_EBCDIC:
1708 handler = xmlFindCharEncodingHandler("EBCDIC");
1709 if (handler != NULL) return(handler);
1710 handler = xmlFindCharEncodingHandler("ebcdic");
1711 if (handler != NULL) return(handler);
1712 break;
1713 case XML_CHAR_ENCODING_UCS4BE:
1714 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1715 if (handler != NULL) return(handler);
1716 handler = xmlFindCharEncodingHandler("UCS-4");
1717 if (handler != NULL) return(handler);
1718 handler = xmlFindCharEncodingHandler("UCS4");
1719 if (handler != NULL) return(handler);
1720 break;
1721 case XML_CHAR_ENCODING_UCS4LE:
1722 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1723 if (handler != NULL) return(handler);
1724 handler = xmlFindCharEncodingHandler("UCS-4");
1725 if (handler != NULL) return(handler);
1726 handler = xmlFindCharEncodingHandler("UCS4");
1727 if (handler != NULL) return(handler);
1728 break;
1729 case XML_CHAR_ENCODING_UCS4_2143:
1730 break;
1731 case XML_CHAR_ENCODING_UCS4_3412:
1732 break;
1733 case XML_CHAR_ENCODING_UCS2:
1734 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1735 if (handler != NULL) return(handler);
1736 handler = xmlFindCharEncodingHandler("UCS-2");
1737 if (handler != NULL) return(handler);
1738 handler = xmlFindCharEncodingHandler("UCS2");
1739 if (handler != NULL) return(handler);
1740 break;
1741
1742 /*
1743 * We used to keep ISO Latin encodings native in the
1744 * generated data. This led to so many problems that
1745 * this has been removed. One can still change this
1746 * back by registering no-ops encoders for those
1747 */
1748 case XML_CHAR_ENCODING_8859_1:
1749 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1750 if (handler != NULL) return(handler);
1751 break;
1752 case XML_CHAR_ENCODING_8859_2:
1753 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1754 if (handler != NULL) return(handler);
1755 break;
1756 case XML_CHAR_ENCODING_8859_3:
1757 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1758 if (handler != NULL) return(handler);
1759 break;
1760 case XML_CHAR_ENCODING_8859_4:
1761 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1762 if (handler != NULL) return(handler);
1763 break;
1764 case XML_CHAR_ENCODING_8859_5:
1765 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1766 if (handler != NULL) return(handler);
1767 break;
1768 case XML_CHAR_ENCODING_8859_6:
1769 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1770 if (handler != NULL) return(handler);
1771 break;
1772 case XML_CHAR_ENCODING_8859_7:
1773 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1774 if (handler != NULL) return(handler);
1775 break;
1776 case XML_CHAR_ENCODING_8859_8:
1777 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1778 if (handler != NULL) return(handler);
1779 break;
1780 case XML_CHAR_ENCODING_8859_9:
1781 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1782 if (handler != NULL) return(handler);
1783 break;
1784
1785
1786 case XML_CHAR_ENCODING_2022_JP:
1787 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1788 if (handler != NULL) return(handler);
1789 break;
1790 case XML_CHAR_ENCODING_SHIFT_JIS:
1791 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1792 if (handler != NULL) return(handler);
1793 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1794 if (handler != NULL) return(handler);
1795 handler = xmlFindCharEncodingHandler("Shift_JIS");
1796 if (handler != NULL) return(handler);
1797 break;
1798 case XML_CHAR_ENCODING_EUC_JP:
1799 handler = xmlFindCharEncodingHandler("EUC-JP");
1800 if (handler != NULL) return(handler);
1801 break;
1802 default:
1803 break;
1804 }
1805
1806#ifdef DEBUG_ENCODING
1807 xmlGenericError(xmlGenericErrorContext,
1808 "No handler found for encoding %d\n", enc);
1809#endif
1810 return(NULL);
1811}
1812
1813/**
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001814 * xmlFindCharEncodingHandler:
1815 * @name: a string describing the char encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001816 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001817 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001818 *
1819 * Returns the handler or NULL if not found
1820 */
1821xmlCharEncodingHandlerPtr
1822xmlFindCharEncodingHandler(const char *name) {
1823 const char *nalias;
1824 const char *norig;
1825 xmlCharEncoding alias;
1826#ifdef LIBXML_ICONV_ENABLED
1827 xmlCharEncodingHandlerPtr enc;
1828 iconv_t icv_in, icv_out;
1829#endif /* LIBXML_ICONV_ENABLED */
1830 char upper[100];
1831 int i;
1832
1833 if (handlers == NULL) xmlInitCharEncodingHandlers();
1834 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1835 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1836
1837 /*
1838 * Do the alias resolution
1839 */
1840 norig = name;
1841 nalias = xmlGetEncodingAlias(name);
1842 if (nalias != NULL)
1843 name = nalias;
1844
1845 /*
1846 * Check first for directly registered encoding names
1847 */
1848 for (i = 0;i < 99;i++) {
1849 upper[i] = toupper(name[i]);
1850 if (upper[i] == 0) break;
1851 }
1852 upper[i] = 0;
1853
1854 for (i = 0;i < nbCharEncodingHandler; i++)
1855 if (!strcmp(upper, handlers[i]->name)) {
1856#ifdef DEBUG_ENCODING
1857 xmlGenericError(xmlGenericErrorContext,
1858 "Found registered handler for encoding %s\n", name);
1859#endif
1860 return(handlers[i]);
1861 }
1862
1863#ifdef LIBXML_ICONV_ENABLED
1864 /* check whether iconv can handle this */
1865 icv_in = iconv_open("UTF-8", name);
1866 icv_out = iconv_open(name, "UTF-8");
1867 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1868 enc = (xmlCharEncodingHandlerPtr)
1869 xmlMalloc(sizeof(xmlCharEncodingHandler));
1870 if (enc == NULL) {
1871 iconv_close(icv_in);
1872 iconv_close(icv_out);
1873 return(NULL);
1874 }
1875 enc->name = xmlMemStrdup(name);
1876 enc->input = NULL;
1877 enc->output = NULL;
1878 enc->iconv_in = icv_in;
1879 enc->iconv_out = icv_out;
1880#ifdef DEBUG_ENCODING
1881 xmlGenericError(xmlGenericErrorContext,
1882 "Found iconv handler for encoding %s\n", name);
1883#endif
1884 return enc;
1885 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1886 xmlGenericError(xmlGenericErrorContext,
1887 "iconv : problems with filters for '%s'\n", name);
1888 }
1889#endif /* LIBXML_ICONV_ENABLED */
1890
1891#ifdef DEBUG_ENCODING
1892 xmlGenericError(xmlGenericErrorContext,
1893 "No handler found for encoding %s\n", name);
1894#endif
1895
1896 /*
1897 * Fallback using the canonical names
1898 */
1899 alias = xmlParseCharEncoding(norig);
1900 if (alias != XML_CHAR_ENCODING_ERROR) {
1901 const char* canon;
1902 canon = xmlGetCharEncodingName(alias);
1903 if ((canon != NULL) && (strcmp(name, canon))) {
1904 return(xmlFindCharEncodingHandler(canon));
1905 }
1906 }
1907
1908 return(NULL);
1909}
1910
Daniel Veillard97ac1312001-05-30 19:14:17 +00001911/************************************************************************
1912 * *
1913 * ICONV based generic conversion functions *
1914 * *
1915 ************************************************************************/
1916
Owen Taylor3473f882001-02-23 17:55:21 +00001917#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert
 * @inlen:  the length of @in
 *
 * Runs one iconv() conversion step from @in to @out.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (for *in is not valid utf8 string or
 *        the result of transformation can't fit into the encoding we want), or
 *     -3 if the last bytes can't form a single output char.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
    unsigned char *out, int *outlen,
    const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    /* iconv() returns a size_t, failure being reported as (size_t) -1 */
    size_t ret;

    ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
    if (in != NULL) {
        /* iconv() left the remaining byte counts in the two lengths */
        *inlen -= (int) icv_inlen;
        *outlen -= (int) icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }
    if (ret == (size_t) -1) {
        /* errno is only meaningful when iconv() reported a failure */
#ifdef EILSEQ
        if (errno == EILSEQ) {
            return -2;
        } else
#endif
#ifdef E2BIG
        if (errno == E2BIG) {
            return -1;
        } else
#endif
#ifdef EINVAL
        if (errno == EINVAL) {
            return -3;
        } else
#endif
        {
            return -3;
        }
    }
    /* no failure reported but some input was left unconverted */
    if (icv_inlen != 0)
        return -3;
    return 0;
}
1976#endif /* LIBXML_ICONV_ENABLED */
1977
Daniel Veillard97ac1312001-05-30 19:14:17 +00001978/************************************************************************
1979 * *
1980 * The real API used by libxml for on-the-fly conversion *
1981 * *
1982 ************************************************************************/
1983
Owen Taylor3473f882001-02-23 17:55:21 +00001984/**
1985 * xmlCharEncFirstLine:
1986 * @handler: char enconding transformation data structure
1987 * @out: an xmlBuffer for the output.
1988 * @in: an xmlBuffer for the input
1989 *
1990 * Front-end for the encoding handler input function, but handle only
1991 * the very first line, i.e. limit itself to 45 chars.
1992 *
1993 * Returns the number of byte written if success, or
1994 * -1 general error
1995 * -2 if the transcoding fails (for *in is not valid utf8 string or
1996 * the result of transformation can't fit into the encoding we want), or
1997 */
1998int
1999xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2000 xmlBufferPtr in) {
2001 int ret = -2;
2002 int written;
2003 int toconv;
2004
2005 if (handler == NULL) return(-1);
2006 if (out == NULL) return(-1);
2007 if (in == NULL) return(-1);
2008
2009 written = out->size - out->use;
2010 toconv = in->use;
2011 if (toconv * 2 >= written) {
2012 xmlBufferGrow(out, toconv);
2013 written = out->size - out->use - 1;
2014 }
2015
2016 /*
2017 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
2018 * 45 chars should be sufficient to reach the end of the encoding
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002019 * declaration without going too far inside the document content.
Owen Taylor3473f882001-02-23 17:55:21 +00002020 */
2021 written = 45;
2022
2023 if (handler->input != NULL) {
2024 ret = handler->input(&out->content[out->use], &written,
2025 in->content, &toconv);
2026 xmlBufferShrink(in, toconv);
2027 out->use += written;
2028 out->content[out->use] = 0;
2029 }
2030#ifdef LIBXML_ICONV_ENABLED
2031 else if (handler->iconv_in != NULL) {
2032 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2033 &written, in->content, &toconv);
2034 xmlBufferShrink(in, toconv);
2035 out->use += written;
2036 out->content[out->use] = 0;
2037 if (ret == -1) ret = -3;
2038 }
2039#endif /* LIBXML_ICONV_ENABLED */
2040#ifdef DEBUG_ENCODING
2041 switch (ret) {
2042 case 0:
2043 xmlGenericError(xmlGenericErrorContext,
2044 "converted %d bytes to %d bytes of input\n",
2045 toconv, written);
2046 break;
2047 case -1:
2048 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2049 toconv, written, in->use);
2050 break;
2051 case -2:
2052 xmlGenericError(xmlGenericErrorContext,
2053 "input conversion failed due to input error\n");
2054 break;
2055 case -3:
2056 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2057 toconv, written, in->use);
2058 break;
2059 default:
2060 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2061 }
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002062#endif /* DEBUG_ENCODING */
Owen Taylor3473f882001-02-23 17:55:21 +00002063 /*
2064 * Ignore when input buffer is not on a boundary
2065 */
2066 if (ret == -3) ret = 0;
2067 if (ret == -1) ret = 0;
2068 return(ret);
2069}
2070
2071/**
2072 * xmlCharEncInFunc:
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002073 * @handler: char encoding transformation data structure
Owen Taylor3473f882001-02-23 17:55:21 +00002074 * @out: an xmlBuffer for the output.
2075 * @in: an xmlBuffer for the input
2076 *
2077 * Generic front-end for the encoding handler input function
2078 *
2079 * Returns the number of byte written if success, or
2080 * -1 general error
2081 * -2 if the transcoding fails (for *in is not valid utf8 string or
2082 * the result of transformation can't fit into the encoding we want), or
2083 */
2084int
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002085xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2086 xmlBufferPtr in)
2087{
Owen Taylor3473f882001-02-23 17:55:21 +00002088 int ret = -2;
2089 int written;
2090 int toconv;
2091
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002092 if (handler == NULL)
2093 return (-1);
2094 if (out == NULL)
2095 return (-1);
2096 if (in == NULL)
2097 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00002098
2099 toconv = in->use;
2100 if (toconv == 0)
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002101 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00002102 written = out->size - out->use;
2103 if (toconv * 2 >= written) {
2104 xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002105 written = out->size - out->use - 1;
Owen Taylor3473f882001-02-23 17:55:21 +00002106 }
2107 if (handler->input != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002108 ret = handler->input(&out->content[out->use], &written,
2109 in->content, &toconv);
2110 xmlBufferShrink(in, toconv);
2111 out->use += written;
2112 out->content[out->use] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002113 }
2114#ifdef LIBXML_ICONV_ENABLED
2115 else if (handler->iconv_in != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002116 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2117 &written, in->content, &toconv);
2118 xmlBufferShrink(in, toconv);
2119 out->use += written;
2120 out->content[out->use] = 0;
2121 if (ret == -1)
2122 ret = -3;
Owen Taylor3473f882001-02-23 17:55:21 +00002123 }
2124#endif /* LIBXML_ICONV_ENABLED */
2125 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002126 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002127#ifdef DEBUG_ENCODING
2128 xmlGenericError(xmlGenericErrorContext,
2129 "converted %d bytes to %d bytes of input\n",
2130 toconv, written);
Owen Taylor3473f882001-02-23 17:55:21 +00002131#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002132 break;
2133 case -1:
2134#ifdef DEBUG_ENCODING
2135 xmlGenericError(xmlGenericErrorContext,
2136 "converted %d bytes to %d bytes of input, %d left\n",
2137 toconv, written, in->use);
2138#endif
2139 break;
2140 case -3:
2141#ifdef DEBUG_ENCODING
2142 xmlGenericError(xmlGenericErrorContext,
2143 "converted %d bytes to %d bytes of input, %d left\n",
2144 toconv, written, in->use);
2145#endif
2146 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002147 case -2:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002148 xmlGenericError(xmlGenericErrorContext,
2149 "input conversion failed due to input error\n");
2150 xmlGenericError(xmlGenericErrorContext,
2151 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2152 in->content[0], in->content[1],
2153 in->content[2], in->content[3]);
Owen Taylor3473f882001-02-23 17:55:21 +00002154 }
2155 /*
2156 * Ignore when input buffer is not on a boundary
2157 */
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002158 if (ret == -3)
2159 ret = 0;
Daniel Veillardd076a202002-11-20 13:28:31 +00002160 return (written);
Owen Taylor3473f882001-02-23 17:55:21 +00002161}
2162
2163/**
2164 * xmlCharEncOutFunc:
2165 * @handler: char enconding transformation data structure
2166 * @out: an xmlBuffer for the output.
2167 * @in: an xmlBuffer for the input
2168 *
2169 * Generic front-end for the encoding handler output function
2170 * a first call with @in == NULL has to be made firs to initiate the
2171 * output in case of non-stateless encoding needing to initiate their
2172 * state or the output (like the BOM in UTF16).
2173 * In case of UTF8 sequence conversion errors for the given encoder,
2174 * the content will be automatically remapped to a CharRef sequence.
2175 *
2176 * Returns the number of byte written if success, or
2177 * -1 general error
2178 * -2 if the transcoding fails (for *in is not valid utf8 string or
2179 * the result of transformation can't fit into the encoding we want), or
2180 */
2181int
2182xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2183 xmlBufferPtr in) {
2184 int ret = -2;
2185 int written;
2186 int writtentot = 0;
2187 int toconv;
2188 int output = 0;
2189
2190 if (handler == NULL) return(-1);
2191 if (out == NULL) return(-1);
2192
2193retry:
2194
2195 written = out->size - out->use;
2196
Igor Zlatkovic73267db2003-03-08 13:29:24 +00002197 if (written > 0)
2198 written--; /* Gennady: count '/0' */
2199
Owen Taylor3473f882001-02-23 17:55:21 +00002200 /*
2201 * First specific handling of in = NULL, i.e. the initialization call
2202 */
2203 if (in == NULL) {
2204 toconv = 0;
2205 if (handler->output != NULL) {
2206 ret = handler->output(&out->content[out->use], &written,
2207 NULL, &toconv);
Igor Zlatkovic73267db2003-03-08 13:29:24 +00002208 if (ret == 0) { /* Gennady: check return value */
2209 out->use += written;
2210 out->content[out->use] = 0;
2211 }
Owen Taylor3473f882001-02-23 17:55:21 +00002212 }
2213#ifdef LIBXML_ICONV_ENABLED
2214 else if (handler->iconv_out != NULL) {
2215 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2216 &written, NULL, &toconv);
2217 out->use += written;
2218 out->content[out->use] = 0;
2219 }
2220#endif /* LIBXML_ICONV_ENABLED */
2221#ifdef DEBUG_ENCODING
2222 xmlGenericError(xmlGenericErrorContext,
2223 "initialized encoder\n");
2224#endif
2225 return(0);
2226 }
2227
2228 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002229 * Conversion itself.
Owen Taylor3473f882001-02-23 17:55:21 +00002230 */
2231 toconv = in->use;
2232 if (toconv == 0)
2233 return(0);
2234 if (toconv * 2 >= written) {
2235 xmlBufferGrow(out, toconv * 2);
2236 written = out->size - out->use - 1;
2237 }
2238 if (handler->output != NULL) {
2239 ret = handler->output(&out->content[out->use], &written,
2240 in->content, &toconv);
2241 xmlBufferShrink(in, toconv);
2242 out->use += written;
2243 writtentot += written;
2244 out->content[out->use] = 0;
2245 }
2246#ifdef LIBXML_ICONV_ENABLED
2247 else if (handler->iconv_out != NULL) {
2248 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2249 &written, in->content, &toconv);
2250 xmlBufferShrink(in, toconv);
2251 out->use += written;
2252 writtentot += written;
2253 out->content[out->use] = 0;
2254 if (ret == -1) {
2255 if (written > 0) {
2256 /*
2257 * Can be a limitation of iconv
2258 */
2259 goto retry;
2260 }
2261 ret = -3;
2262 }
2263 }
2264#endif /* LIBXML_ICONV_ENABLED */
2265 else {
2266 xmlGenericError(xmlGenericErrorContext,
2267 "xmlCharEncOutFunc: no output function !\n");
2268 return(-1);
2269 }
2270
2271 if (ret >= 0) output += ret;
2272
2273 /*
2274 * Attempt to handle error cases
2275 */
2276 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002277 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002278#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002279 xmlGenericError(xmlGenericErrorContext,
2280 "converted %d bytes to %d bytes of output\n",
2281 toconv, written);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002282#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002283 break;
2284 case -1:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002285#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002286 xmlGenericError(xmlGenericErrorContext,
2287 "output conversion failed by lack of space\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002288#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002289 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002290 case -3:
Daniel Veillard809faa52003-02-10 15:43:53 +00002291#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002292 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2293 toconv, written, in->use);
Daniel Veillard809faa52003-02-10 15:43:53 +00002294#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002295 break;
2296 case -2: {
2297 int len = in->use;
2298 const xmlChar *utf = (const xmlChar *) in->content;
2299 int cur;
2300
2301 cur = xmlGetUTF8Char(utf, &len);
2302 if (cur > 0) {
2303 xmlChar charref[20];
2304
2305#ifdef DEBUG_ENCODING
2306 xmlGenericError(xmlGenericErrorContext,
2307 "handling output conversion error\n");
2308 xmlGenericError(xmlGenericErrorContext,
2309 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2310 in->content[0], in->content[1],
2311 in->content[2], in->content[3]);
2312#endif
2313 /*
2314 * Removes the UTF8 sequence, and replace it by a charref
2315 * and continue the transcoding phase, hoping the error
2316 * did not mangle the encoder state.
2317 */
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002318 snprintf((char *) charref, sizeof(charref), "&#%d;", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002319 xmlBufferShrink(in, len);
2320 xmlBufferAddHead(in, charref, -1);
2321
2322 goto retry;
2323 } else {
2324 xmlGenericError(xmlGenericErrorContext,
2325 "output conversion failed due to conv error\n");
2326 xmlGenericError(xmlGenericErrorContext,
2327 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2328 in->content[0], in->content[1],
2329 in->content[2], in->content[3]);
2330 in->content[0] = ' ';
2331 }
2332 break;
2333 }
2334 }
2335 return(ret);
2336}
2337
2338/**
2339 * xmlCharEncCloseFunc:
2340 * @handler: char enconding transformation data structure
2341 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002342 * Generic front-end for encoding handler close function
Owen Taylor3473f882001-02-23 17:55:21 +00002343 *
2344 * Returns 0 if success, or -1 in case of error
2345 */
2346int
2347xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2348 int ret = 0;
2349 if (handler == NULL) return(-1);
2350 if (handler->name == NULL) return(-1);
2351#ifdef LIBXML_ICONV_ENABLED
2352 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002353 * Iconv handlers can be used only once, free the whole block.
Owen Taylor3473f882001-02-23 17:55:21 +00002354 * and the associated icon resources.
2355 */
2356 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2357 if (handler->name != NULL)
2358 xmlFree(handler->name);
2359 handler->name = NULL;
2360 if (handler->iconv_out != NULL) {
2361 if (iconv_close(handler->iconv_out))
2362 ret = -1;
2363 handler->iconv_out = NULL;
2364 }
2365 if (handler->iconv_in != NULL) {
2366 if (iconv_close(handler->iconv_in))
2367 ret = -1;
2368 handler->iconv_in = NULL;
2369 }
2370 xmlFree(handler);
2371 }
2372#endif /* LIBXML_ICONV_ENABLED */
2373#ifdef DEBUG_ENCODING
2374 if (ret)
2375 xmlGenericError(xmlGenericErrorContext,
2376 "failed to close the encoding handler\n");
2377 else
2378 xmlGenericError(xmlGenericErrorContext,
2379 "closed the encoding handler\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002380#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002381
Owen Taylor3473f882001-02-23 17:55:21 +00002382 return(ret);
2383}
2384