/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 *
 * UTF8 string routines from:
 * "William M. Brack" <wbrack@mmm.com.hk>
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 */
25
Daniel Veillard34ce8be2002-03-18 19:37:11 +000026#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000027#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000028
Owen Taylor3473f882001-02-23 17:55:21 +000029#include <string.h>
30
31#ifdef HAVE_CTYPE_H
32#include <ctype.h>
33#endif
34#ifdef HAVE_STDLIB_H
35#include <stdlib.h>
36#endif
Owen Taylor3473f882001-02-23 17:55:21 +000037#ifdef LIBXML_ICONV_ENABLED
38#ifdef HAVE_ERRNO_H
39#include <errno.h>
40#endif
41#endif
42#include <libxml/encoding.h>
43#include <libxml/xmlmemory.h>
44#ifdef LIBXML_HTML_ENABLED
45#include <libxml/HTMLparser.h>
46#endif
Daniel Veillard64a411c2001-10-15 12:32:07 +000047#include <libxml/globals.h>
Daniel Veillarda4617b82001-11-04 20:19:12 +000048#include <libxml/xmlerror.h>
Owen Taylor3473f882001-02-23 17:55:21 +000049
Daniel Veillard22090732001-07-16 00:06:07 +000050static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
51static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +000052
53typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
54typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
55struct _xmlCharEncodingAlias {
56 const char *name;
57 const char *alias;
58};
59
60static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
61static int xmlCharEncodingAliasesNb = 0;
62static int xmlCharEncodingAliasesMax = 0;
63
64#ifdef LIBXML_ICONV_ENABLED
65#if 0
66#define DEBUG_ENCODING /* Define this to get encoding traces */
67#endif
68#endif
69
70static int xmlLittleEndian = 1;
71
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 *                                                                      *
 ************************************************************************/
Owen Taylor3473f882001-02-23 17:55:21 +000086
87/**
Daniel Veillarde043ee12001-04-16 14:08:07 +000088 * xmlUTF8Strlen:
89 * @utf: a sequence of UTF-8 encoded bytes
90 *
Daniel Veillard60087f32001-10-10 09:45:09 +000091 * compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillarde043ee12001-04-16 14:08:07 +000092 * checking of the content of the string.
93 *
94 * Returns the number of characters in the string or -1 in case of error
95 */
96int
Daniel Veillard97ac1312001-05-30 19:14:17 +000097xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillarde043ee12001-04-16 14:08:07 +000098 int ret = 0;
99
100 if (utf == NULL)
101 return(-1);
102
103 while (*utf != 0) {
104 if (utf[0] & 0x80) {
105 if ((utf[1] & 0xc0) != 0x80)
106 return(-1);
107 if ((utf[0] & 0xe0) == 0xe0) {
108 if ((utf[2] & 0xc0) != 0x80)
109 return(-1);
110 if ((utf[0] & 0xf0) == 0xf0) {
111 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
112 return(-1);
113 utf += 4;
114 } else {
115 utf += 3;
116 }
117 } else {
118 utf += 2;
119 }
120 } else {
121 utf++;
122 }
123 ret++;
124 }
125 return(ret);
126}
127
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the number of bytes available at @utf; on return
 *        it holds the number of bytes used
 *
 * Read one UTF8 Char from @utf.  Only the lead byte class and the
 * continuation byte markers are checked; overlong forms are not rejected.
 *
 * Returns the char value or -1 in case of error and update @len with the
 *         number of bytes used (0 in case of error).  A NULL @len is
 *         reported as an error without being written to.
 */
static int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
194
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid, false otherwise (including when
 *               @utf is NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    for (ix = 0; (c = utf[ix]) != 0;) {
        if (c & 0x80) {
            /*
             * Each byte is inspected only after the previous one was
             * accepted, so the terminating 0 is never stepped over.
             */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            if ((c & 0xe0) == 0xe0) {
                if ((utf[ix + 2] & 0xc0) != 0x80)
                    return(0);
                if ((c & 0xf0) == 0xf0) {
                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                        return(0);
                    /* 4-byte code */
                    ix += 4;
                } else {
                    /* 3-byte code */
                    ix += 3;
                }
            } else {
                /* 2-byte code */
                ix += 2;
            }
        } else {
            /* 1-byte code */
            ix++;
        }
    }
    return(1);
}
238
239/**
Daniel Veillard97ac1312001-05-30 19:14:17 +0000240 * xmlUTF8Strsize:
241 * @utf: a sequence of UTF-8 encoded bytes
242 * @len: the number of characters in the array
243 *
244 * storage size of an UTF8 string
245 *
246 * Returns the storage size of
247 * the first 'len' characters of ARRAY
248 *
249 */
250
251int
252xmlUTF8Strsize(const xmlChar *utf, int len) {
253 const xmlChar *ptr=utf;
254 xmlChar ch;
255
256 if (len <= 0)
257 return(0);
258
259 while ( len-- > 0) {
260 if ( !*ptr )
261 break;
262 if ( (ch = *ptr++) & 0x80)
263 while ( (ch<<=1) & 0x80 )
264 ptr++;
265 }
266 return (ptr - utf);
267}
268
269
270/**
271 * xmlUTF8Strndup:
272 * @utf: the input UTF8 *
273 * @len: the len of @utf (in chars)
274 *
275 * a strndup for array of UTF8's
276 *
277 * Returns a new UTF8 * or NULL
278 */
279xmlChar *
280xmlUTF8Strndup(const xmlChar *utf, int len) {
281 xmlChar *ret;
282 int i;
283
284 if ((utf == NULL) || (len < 0)) return(NULL);
285 i = xmlUTF8Strsize(utf, len);
286 ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
287 if (ret == NULL) {
288 xmlGenericError(xmlGenericErrorContext,
289 "malloc of %ld byte failed\n",
290 (len + 1) * (long)sizeof(xmlChar));
291 return(NULL);
292 }
293 memcpy(ret, utf, i * sizeof(xmlChar));
294 ret[i] = 0;
295 return(ret);
296}
297
298/**
299 * xmlUTF8Strpos:
300 * @utf: the input UTF8 *
301 * @pos: the position of the desired UTF8 char (in chars)
302 *
303 * a function to provide the equivalent of fetching a
304 * character from a string array
305 *
306 * Returns a pointer to the UTF8 character or NULL
307 */
308xmlChar *
309xmlUTF8Strpos(const xmlChar *utf, int pos) {
310 xmlChar ch;
311
312 if (utf == NULL) return(NULL);
313 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
314 return(NULL);
315 while (pos--) {
316 if ((ch=*utf++) == 0) return(NULL);
317 if ( ch & 0x80 ) {
318 /* if not simple ascii, verify proper format */
319 if ( (ch & 0xc0) != 0xc0 )
320 return(NULL);
321 /* then skip over remaining bytes for this char */
322 while ( (ch <<= 1) & 0x80 )
323 if ( (*utf++ & 0xc0) != 0x80 )
324 return(NULL);
325 }
326 }
327 return((xmlChar *)utf);
328}
329
330/**
331 * xmlUTF8Strloc:
332 * @utf: the input UTF8 *
333 * @utfchar: the UTF8 character to be found
334 *
335 * a function to provide relative location of a UTF8 char
336 *
337 * Returns the relative character position of the desired char
338 * or -1 if not found
339 */
340int
341xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
342 int i, size;
343 xmlChar ch;
344
345 if (utf==NULL || utfchar==NULL) return -1;
346 size = xmlUTF8Strsize(utfchar, 1);
347 for(i=0; (ch=*utf) != 0; i++) {
348 if (xmlStrncmp(utf, utfchar, size)==0)
349 return(i);
350 utf++;
351 if ( ch & 0x80 ) {
352 /* if not simple ascii, verify proper format */
353 if ( (ch & 0xc0) != 0xc0 )
354 return(-1);
355 /* then skip over remaining bytes for this char */
356 while ( (ch <<= 1) & 0x80 )
357 if ( (*utf++ & 0xc0) != 0x80 )
358 return(-1);
359 }
360 }
361
362 return(-1);
363}
364/**
365 * xmlUTF8Strsub:
366 * @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard97ac1312001-05-30 19:14:17 +0000367 * @start: relative pos of first char
368 * @len: total number to copy
369 *
370 * Note: positions are given in units of UTF-8 chars
371 *
372 * Returns a pointer to a newly created string
373 * or NULL if any problem
374 */
375
376xmlChar *
377xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
378 int i;
379 xmlChar ch;
380
381 if (utf == NULL) return(NULL);
382 if (start < 0) return(NULL);
383 if (len < 0) return(NULL);
384
385 /*
386 * Skip over any leading chars
387 */
388 for (i = 0;i < start;i++) {
389 if ((ch=*utf++) == 0) return(NULL);
390 if ( ch & 0x80 ) {
391 /* if not simple ascii, verify proper format */
392 if ( (ch & 0xc0) != 0xc0 )
393 return(NULL);
394 /* then skip over remaining bytes for this char */
395 while ( (ch <<= 1) & 0x80 )
396 if ( (*utf++ & 0xc0) != 0x80 )
397 return(NULL);
398 }
399 }
400
401 return(xmlUTF8Strndup(utf, len));
402}
403
/************************************************************************
 *                                                                      *
 *              Conversions To/From UTF8 encoding                       *
 *                                                                      *
 ************************************************************************/
409
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out.  An ASCII char is its own UTF-8 encoding, so each
 * input byte below 0x80 is copied as is; conversion stops when either
 * buffer is exhausted.
 *
 * Returns 0 if success, or -1 if a byte >= 0x80 is met
 * The value of @inlen after return is the number of octets consumed
 *     (up to, and not including, the offending byte in case of error).
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* base = in;
    const unsigned char* inend = in + (*inlen);

    /* one output byte per input byte: no extra headroom is required */
    while ((in < inend) && (out < outend)) {
        if (*in >= 0x80) {
            /* not an ASCII char */
            *outlen = out - outstart;
            *inlen = in - base;
            return(-1);
        }
        *out++ = *in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
460
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.  A NULL @in is an initialization call and does
 * nothing.  A multi-byte sequence cut short by the end of @in is left
 * unconsumed so that the caller can supply the rest later.
 *
 * Returns 0 if success, -2 if the transcoding fails (badly formed input
 *         or a character which does not exist in ASCII)
 * The value of @inlen after return is the number of octets consumed,
 *     counting only the characters fully converted.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80) { c = d;        trailing = 0; }
        else if (d < 0xC0) { ret = -2; break; } /* trailing byte first */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else               { ret = -2; break; } /* no chance in Ascii */

        /* incomplete sequence: keep it for the next call */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                break;
            c <<= 6;
            c |= d & 0x3F;
        }
        if (trailing != 0) {
            /* a continuation byte was expected: badly formed input */
            ret = -2;
            break;
        }

        /* assertion: c is a single UTF-4 value */
        if (c >= 0x80) {
            /* no chance for this in Ascii */
            ret = -2;
            break;
        }
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(ret);
}
543
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.  Bytes below 0x80 are copied, the others are
 * expanded to a 2-byte sequence; conversion stops when the input is
 * exhausted or the next char does not fit in @out.
 *
 * Returns 0 if success, or -1 otherwise (NULL argument)
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend;
    const unsigned char* inend;
    const unsigned char* instop;

    if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);

    outend = out + *outlen;
    inend = in + (*inlen);
    instop = inend;

    /*
     * *in is only read after checking in against inend/instop, so no
     * byte outside [in, inend) is ever accessed.
     */
    while ((in < inend) && (outend - out > 1)) {
        if (*in >= 0x80) {
            *out++ = ((*in >> 6) & 0x1F) | 0xC0;
            *out++ = (*in & 0x3F) | 0x80;
            ++in;
        }
        /* bound the ASCII run by the space left in the output */
        if ((instop - in) > (outend - out))
            instop = in + (outend - out);
        while ((in < instop) && (*in < 0x80))
            *out++ = *in++;
    }
    /* a last single byte char may still fit in the final output byte */
    if ((in < inend) && (out < outend) && (*in < 0x80))
        *out++ = *in++;
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
593
Daniel Veillarde72c7562002-05-31 09:47:30 +0000594
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.  A NULL @in is an initialization call and does
 * nothing.  A multi-byte sequence cut short by the end of @in is left
 * unconsumed.
 *
 * Returns 0 if success, -2 if the transcoding fails (badly formed input
 *         or a character above 0xFF)
 * The value of @inlen after return is the number of octets consumed,
 *     counting only the characters fully converted.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *src_begin = in;
    const unsigned char *src_done = in;
    const unsigned char *src_end;
    const unsigned char *dst_begin = out;
    const unsigned char *dst_end;
    int status = 0;

    if (in == NULL) {
        /* initialization call: nothing to emit */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    src_end = in + (*inlen);
    dst_end = out + (*outlen);

    while (in < src_end) {
        unsigned int lead = *in++;
        unsigned int code;
        int extra;

        if (lead < 0x80) {
            code = lead;
            extra = 0;
        } else if ((lead < 0xC0) || (lead >= 0xF8)) {
            /* continuation byte in leading position, or 5+ byte form */
            status = -2;
            break;
        } else if (lead < 0xE0) {
            code = lead & 0x1F;
            extra = 1;
        } else if (lead < 0xF0) {
            code = lead & 0x0F;
            extra = 2;
        } else {
            code = lead & 0x07;
            extra = 3;
        }

        /* the sequence is not complete yet, leave it unconsumed */
        if (src_end - in < extra)
            break;

        while (extra > 0) {
            unsigned int cont = *in++;

            if ((cont & 0xC0) != 0x80) {
                status = -2;
                break;
            }
            code = (code << 6) | (cont & 0x3F);
            extra--;
        }
        if (status != 0)
            break;

        if (code > 0xFF) {
            /* no chance for this in IsoLat1 */
            status = -2;
            break;
        }
        if (out >= dst_end)
            break;
        *out++ = code;
        src_done = in;
    }

    *outlen = out - dst_begin;
    *inlen = src_done - src_begin;
    return(status);
}
682
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is read byte by byte, so it needs
 * no particular alignment and the result does not depend on the byte
 * order of the host.  A trailing odd byte, or a high surrogate whose
 * pair has not been received yet, is left unconsumed.  A character is
 * only written when it fits entirely in @out.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 *         surrogate is not followed by a low surrogate)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d, lead;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    while (in < inend) {
        /* little endian: low order byte first */
        c = in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* second half not available yet, wait for it */
                break;
            }
            d = in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80) { lead = c;                         bits = -6; }
        else if (c <   0x800) { lead = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
        else if (c < 0x10000) { lead = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
        else                  { lead = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        /* 1 lead byte plus (bits / 6 + 1) continuation bytes are needed */
        if (outend - out < 2 + bits / 6)
            break;

        *out++ = lead;
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
770
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.  A NULL @in is an initialization call: the Byte
 * Order Mark FF FE is written if @outb has room for it.  The output is
 * written byte by byte, so @outb needs no particular alignment.  A
 * sequence cut short by the end of @in, or a character which does not
 * fit in the space left, is left unconsumed.
 *
 * Returns 0 if success (2 when the Byte Order Mark was written), or -2
 *         if the transcoding failed (badly formed input or a value
 *         above 0x10FFFF)
 * The value of *inlen after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16 bit units can be written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80) { c = d;        trailing = 0; }
        else if (d < 0xC0) { ret = -2; break; } /* trailing byte first */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else               { ret = -2; break; } /* no chance in UTF-16 */

        /* incomplete sequence: keep it for the next call */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                break;
            c <<= 6;
            c |= d & 0x3F;
        }
        if (trailing != 0) {
            /* a continuation byte was expected: badly formed input */
            ret = -2;
            break;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = c & 0xFF;
            out[1] = (c >> 8) & 0xFF;
            out += 2;
        }
        else if (c < 0x110000) {
            /* surrogate pair, high half first */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            d = 0xD800 | (c >> 10);
            out[0] = d & 0xFF;
            out[1] = (d >> 8) & 0xFF;
            d = 0xDC00 | (c & 0x03FF);
            out[2] = d & 0xFF;
            out[3] = (d >> 8) & 0xFF;
            out += 4;
        }
        else {
            /* beyond the range UTF-16 can encode */
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(ret);
}
889
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is read byte by byte, so it needs
 * no particular alignment and the result does not depend on the byte
 * order of the host.  A trailing odd byte, or a high surrogate whose
 * pair has not been received yet, is left unconsumed.  A character is
 * only written when it fits entirely in @out, so the output never ends
 * with a truncated sequence.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high
 *         surrogate is not followed by a low surrogate)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d, lead;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    while (in < inend) {
        /* big endian: high order byte first */
        c = ((unsigned int) in[0] << 8) | in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* second half not available yet, wait for it */
                break;
            }
            d = ((unsigned int) in[0] << 8) | in[1];
            in += 2;
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80) { lead = c;                         bits = -6; }
        else if (c <   0x800) { lead = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
        else if (c < 0x10000) { lead = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
        else                  { lead = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        /* 1 lead byte plus (bits / 6 + 1) continuation bytes are needed */
        if (outend - out < 2 + bits / 6)
            break;

        *out++ = lead;
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
981
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out.  A NULL @in is an initialization call: the Byte
 * Order Mark FE FF is written if @outb has room for it.  The output is
 * written byte by byte, so @outb needs no particular alignment.  A
 * sequence cut short by the end of @in, or a character which does not
 * fit in the space left, is left unconsumed.
 *
 * Returns 0 if success (2 when the Byte Order Mark was written), or -2
 *         if the transcoding failed (badly formed input or a value
 *         above 0x10FFFF)
 * The value of *inlen after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16 bit units can be written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80) { c = d;        trailing = 0; }
        else if (d < 0xC0) { ret = -2; break; } /* trailing byte first */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else               { ret = -2; break; } /* no chance in UTF-16 */

        /* incomplete sequence: keep it for the next call */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                break;
            c <<= 6;
            c |= d & 0x3F;
        }
        if (trailing != 0) {
            /* a continuation byte was expected: badly formed input */
            ret = -2;
            break;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (c >> 8) & 0xFF;
            out[1] = c & 0xFF;
            out += 2;
        }
        else if (c < 0x110000) {
            /* surrogate pair, high half first */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            d = 0xD800 | (c >> 10);
            out[0] = (d >> 8) & 0xFF;
            out[1] = d & 0xFF;
            d = 0xDC00 | (c & 0x03FF);
            out[2] = (d >> 8) & 0xFF;
            out[3] = d & 0xFF;
            out += 4;
        }
        else {
            /* beyond the range UTF-16 can encode */
            ret = -2;
            break;
        }
        processed = in;
    }
    /* both lengths are byte counts, on the error paths too */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(ret);
}
1097
/************************************************************************
 *                                                                      *
 *              Generic encoding handling routines                      *
 *                                                                      *
 ************************************************************************/
1103
Owen Taylor3473f882001-02-23 17:55:21 +00001104/**
1105 * xmlDetectCharEncoding:
1106 * @in: a pointer to the first bytes of the XML entity, must be at least
1107 * 4 bytes long.
1108 * @len: pointer to the length of the buffer
1109 *
1110 * Guess the encoding of the entity using the first bytes of the entity content
1111 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1112 *
1113 * Returns one of the XML_CHAR_ENCODING_... values.
1114 */
1115xmlCharEncoding
1116xmlDetectCharEncoding(const unsigned char* in, int len)
1117{
1118 if (len >= 4) {
1119 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1120 (in[2] == 0x00) && (in[3] == 0x3C))
1121 return(XML_CHAR_ENCODING_UCS4BE);
1122 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1123 (in[2] == 0x00) && (in[3] == 0x00))
1124 return(XML_CHAR_ENCODING_UCS4LE);
1125 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1126 (in[2] == 0x3C) && (in[3] == 0x00))
1127 return(XML_CHAR_ENCODING_UCS4_2143);
1128 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1129 (in[2] == 0x00) && (in[3] == 0x00))
1130 return(XML_CHAR_ENCODING_UCS4_3412);
1131 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1132 (in[2] == 0xA7) && (in[3] == 0x94))
1133 return(XML_CHAR_ENCODING_EBCDIC);
1134 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1135 (in[2] == 0x78) && (in[3] == 0x6D))
1136 return(XML_CHAR_ENCODING_UTF8);
1137 }
Daniel Veillard87a764e2001-06-20 17:41:10 +00001138 if (len >= 3) {
1139 /*
1140 * Errata on XML-1.0 June 20 2001
1141 * We now allow an UTF8 encoded BOM
1142 */
1143 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1144 (in[2] == 0xBF))
1145 return(XML_CHAR_ENCODING_UTF8);
1146 }
Owen Taylor3473f882001-02-23 17:55:21 +00001147 if (len >= 2) {
1148 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1149 return(XML_CHAR_ENCODING_UTF16BE);
1150 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1151 return(XML_CHAR_ENCODING_UTF16LE);
1152 }
1153 return(XML_CHAR_ENCODING_NONE);
1154}
1155
1156/**
1157 * xmlCleanupEncodingAliases:
1158 *
1159 * Unregisters all aliases
1160 */
1161void
1162xmlCleanupEncodingAliases(void) {
1163 int i;
1164
1165 if (xmlCharEncodingAliases == NULL)
1166 return;
1167
1168 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1169 if (xmlCharEncodingAliases[i].name != NULL)
1170 xmlFree((char *) xmlCharEncodingAliases[i].name);
1171 if (xmlCharEncodingAliases[i].alias != NULL)
1172 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1173 }
1174 xmlCharEncodingAliasesNb = 0;
1175 xmlCharEncodingAliasesMax = 0;
1176 xmlFree(xmlCharEncodingAliases);
Daniel Veillard73c6e532002-01-08 13:15:33 +00001177 xmlCharEncodingAliases = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001178}
1179
1180/**
1181 * xmlGetEncodingAlias:
1182 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1183 *
1184 * Lookup an encoding name for the given alias.
1185 *
1186 * Returns NULL if not found the original name otherwise
1187 */
1188const char *
1189xmlGetEncodingAlias(const char *alias) {
1190 int i;
1191 char upper[100];
1192
1193 if (alias == NULL)
1194 return(NULL);
1195
1196 if (xmlCharEncodingAliases == NULL)
1197 return(NULL);
1198
1199 for (i = 0;i < 99;i++) {
1200 upper[i] = toupper(alias[i]);
1201 if (upper[i] == 0) break;
1202 }
1203 upper[i] = 0;
1204
1205 /*
1206 * Walk down the list looking for a definition of the alias
1207 */
1208 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1209 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1210 return(xmlCharEncodingAliases[i].name);
1211 }
1212 }
1213 return(NULL);
1214}
1215
1216/**
1217 * xmlAddEncodingAlias:
1218 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1219 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1220 *
1221 * Registers and alias @alias for an encoding named @name. Existing alias
1222 * will be overwritten.
1223 *
1224 * Returns 0 in case of success, -1 in case of error
1225 */
1226int
1227xmlAddEncodingAlias(const char *name, const char *alias) {
1228 int i;
1229 char upper[100];
1230
1231 if ((name == NULL) || (alias == NULL))
1232 return(-1);
1233
1234 for (i = 0;i < 99;i++) {
1235 upper[i] = toupper(alias[i]);
1236 if (upper[i] == 0) break;
1237 }
1238 upper[i] = 0;
1239
1240 if (xmlCharEncodingAliases == NULL) {
1241 xmlCharEncodingAliasesNb = 0;
1242 xmlCharEncodingAliasesMax = 20;
1243 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1244 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1245 if (xmlCharEncodingAliases == NULL)
1246 return(-1);
1247 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1248 xmlCharEncodingAliasesMax *= 2;
1249 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1250 xmlRealloc(xmlCharEncodingAliases,
1251 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1252 }
1253 /*
1254 * Walk down the list looking for a definition of the alias
1255 */
1256 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1257 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1258 /*
1259 * Replace the definition.
1260 */
1261 xmlFree((char *) xmlCharEncodingAliases[i].name);
1262 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1263 return(0);
1264 }
1265 }
1266 /*
1267 * Add the definition
1268 */
1269 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1270 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1271 xmlCharEncodingAliasesNb++;
1272 return(0);
1273}
1274
1275/**
1276 * xmlDelEncodingAlias:
1277 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1278 *
1279 * Unregisters an encoding alias @alias
1280 *
1281 * Returns 0 in case of success, -1 in case of error
1282 */
1283int
1284xmlDelEncodingAlias(const char *alias) {
1285 int i;
1286
1287 if (alias == NULL)
1288 return(-1);
1289
1290 if (xmlCharEncodingAliases == NULL)
1291 return(-1);
1292 /*
1293 * Walk down the list looking for a definition of the alias
1294 */
1295 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1296 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1297 xmlFree((char *) xmlCharEncodingAliases[i].name);
1298 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1299 xmlCharEncodingAliasesNb--;
1300 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1301 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1302 return(0);
1303 }
1304 }
1305 return(-1);
1306}
1307
1308/**
1309 * xmlParseCharEncoding:
1310 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1311 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001312 * Compare the string to the known encoding schemes already known. Note
Owen Taylor3473f882001-02-23 17:55:21 +00001313 * that the comparison is case insensitive accordingly to the section
1314 * [XML] 4.3.3 Character Encoding in Entities.
1315 *
1316 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1317 * if not recognized.
1318 */
1319xmlCharEncoding
1320xmlParseCharEncoding(const char* name)
1321{
1322 const char *alias;
1323 char upper[500];
1324 int i;
1325
1326 if (name == NULL)
1327 return(XML_CHAR_ENCODING_NONE);
1328
1329 /*
1330 * Do the alias resolution
1331 */
1332 alias = xmlGetEncodingAlias(name);
1333 if (alias != NULL)
1334 name = alias;
1335
1336 for (i = 0;i < 499;i++) {
1337 upper[i] = toupper(name[i]);
1338 if (upper[i] == 0) break;
1339 }
1340 upper[i] = 0;
1341
1342 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1343 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1344 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1345
1346 /*
1347 * NOTE: if we were able to parse this, the endianness of UTF16 is
1348 * already found and in use
1349 */
1350 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1351 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1352
1353 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1354 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1355 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1356
1357 /*
1358 * NOTE: if we were able to parse this, the endianness of UCS4 is
1359 * already found and in use
1360 */
1361 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1362 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1363 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1364
1365
1366 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1367 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1368 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1369
1370 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1371 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1372 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1373
1374 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1375 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1376 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1377 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1378 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1379 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1380 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1381
1382 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1383 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1384 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1385
1386#ifdef DEBUG_ENCODING
1387 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1388#endif
1389 return(XML_CHAR_ENCODING_ERROR);
1390}
1391
1392/**
1393 * xmlGetCharEncodingName:
1394 * @enc: the encoding
1395 *
1396 * The "canonical" name for XML encoding.
1397 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1398 * Section 4.3.3 Character Encoding in Entities
1399 *
1400 * Returns the canonical name for the given encoding
1401 */
1402
1403const char*
1404xmlGetCharEncodingName(xmlCharEncoding enc) {
1405 switch (enc) {
1406 case XML_CHAR_ENCODING_ERROR:
1407 return(NULL);
1408 case XML_CHAR_ENCODING_NONE:
1409 return(NULL);
1410 case XML_CHAR_ENCODING_UTF8:
1411 return("UTF-8");
1412 case XML_CHAR_ENCODING_UTF16LE:
1413 return("UTF-16");
1414 case XML_CHAR_ENCODING_UTF16BE:
1415 return("UTF-16");
1416 case XML_CHAR_ENCODING_EBCDIC:
1417 return("EBCDIC");
1418 case XML_CHAR_ENCODING_UCS4LE:
1419 return("ISO-10646-UCS-4");
1420 case XML_CHAR_ENCODING_UCS4BE:
1421 return("ISO-10646-UCS-4");
1422 case XML_CHAR_ENCODING_UCS4_2143:
1423 return("ISO-10646-UCS-4");
1424 case XML_CHAR_ENCODING_UCS4_3412:
1425 return("ISO-10646-UCS-4");
1426 case XML_CHAR_ENCODING_UCS2:
1427 return("ISO-10646-UCS-2");
1428 case XML_CHAR_ENCODING_8859_1:
1429 return("ISO-8859-1");
1430 case XML_CHAR_ENCODING_8859_2:
1431 return("ISO-8859-2");
1432 case XML_CHAR_ENCODING_8859_3:
1433 return("ISO-8859-3");
1434 case XML_CHAR_ENCODING_8859_4:
1435 return("ISO-8859-4");
1436 case XML_CHAR_ENCODING_8859_5:
1437 return("ISO-8859-5");
1438 case XML_CHAR_ENCODING_8859_6:
1439 return("ISO-8859-6");
1440 case XML_CHAR_ENCODING_8859_7:
1441 return("ISO-8859-7");
1442 case XML_CHAR_ENCODING_8859_8:
1443 return("ISO-8859-8");
1444 case XML_CHAR_ENCODING_8859_9:
1445 return("ISO-8859-9");
1446 case XML_CHAR_ENCODING_2022_JP:
1447 return("ISO-2022-JP");
1448 case XML_CHAR_ENCODING_SHIFT_JIS:
1449 return("Shift-JIS");
1450 case XML_CHAR_ENCODING_EUC_JP:
1451 return("EUC-JP");
1452 case XML_CHAR_ENCODING_ASCII:
1453 return(NULL);
1454 }
1455 return(NULL);
1456}
1457
Daniel Veillard97ac1312001-05-30 19:14:17 +00001458/************************************************************************
1459 * *
1460 * Char encoding handlers *
1461 * *
1462 ************************************************************************/
1463
Owen Taylor3473f882001-02-23 17:55:21 +00001464
/*
 * Capacity of the handler table: xmlInitCharEncodingHandlers() allocates
 * that many slots and xmlRegisterCharEncodingHandler() refuses any new
 * handler once they are all in use.
 * the size should be growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
/* table of registered handlers, NULL until xmlInitCharEncodingHandlers() ran */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of slots of the table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * xmlFindCharEncodingHandler() returns this value when given a NULL or
 * empty name.
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1476
1477/**
1478 * xmlNewCharEncodingHandler:
1479 * @name: the encoding name, in UTF-8 format (ASCII actually)
1480 * @input: the xmlCharEncodingInputFunc to read that encoding
1481 * @output: the xmlCharEncodingOutputFunc to write that encoding
1482 *
1483 * Create and registers an xmlCharEncodingHandler.
1484 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1485 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001486static xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001487xmlNewCharEncodingHandler(const char *name,
1488 xmlCharEncodingInputFunc input,
1489 xmlCharEncodingOutputFunc output) {
1490 xmlCharEncodingHandlerPtr handler;
1491 const char *alias;
1492 char upper[500];
1493 int i;
1494 char *up = 0;
1495
1496 /*
1497 * Do the alias resolution
1498 */
1499 alias = xmlGetEncodingAlias(name);
1500 if (alias != NULL)
1501 name = alias;
1502
1503 /*
1504 * Keep only the uppercase version of the encoding.
1505 */
1506 if (name == NULL) {
1507 xmlGenericError(xmlGenericErrorContext,
1508 "xmlNewCharEncodingHandler : no name !\n");
1509 return(NULL);
1510 }
1511 for (i = 0;i < 499;i++) {
1512 upper[i] = toupper(name[i]);
1513 if (upper[i] == 0) break;
1514 }
1515 upper[i] = 0;
1516 up = xmlMemStrdup(upper);
1517 if (up == NULL) {
1518 xmlGenericError(xmlGenericErrorContext,
1519 "xmlNewCharEncodingHandler : out of memory !\n");
1520 return(NULL);
1521 }
1522
1523 /*
1524 * allocate and fill-up an handler block.
1525 */
1526 handler = (xmlCharEncodingHandlerPtr)
1527 xmlMalloc(sizeof(xmlCharEncodingHandler));
1528 if (handler == NULL) {
1529 xmlGenericError(xmlGenericErrorContext,
1530 "xmlNewCharEncodingHandler : out of memory !\n");
1531 return(NULL);
1532 }
1533 handler->input = input;
1534 handler->output = output;
1535 handler->name = up;
1536
1537#ifdef LIBXML_ICONV_ENABLED
1538 handler->iconv_in = NULL;
1539 handler->iconv_out = NULL;
1540#endif /* LIBXML_ICONV_ENABLED */
1541
1542 /*
1543 * registers and returns the handler.
1544 */
1545 xmlRegisterCharEncodingHandler(handler);
1546#ifdef DEBUG_ENCODING
1547 xmlGenericError(xmlGenericErrorContext,
1548 "Registered encoding handler for %s\n", name);
1549#endif
1550 return(handler);
1551}
1552
1553/**
1554 * xmlInitCharEncodingHandlers:
1555 *
1556 * Initialize the char encoding support, it registers the default
1557 * encoding supported.
1558 * NOTE: while public, this function usually doesn't need to be called
1559 * in normal processing.
1560 */
1561void
1562xmlInitCharEncodingHandlers(void) {
1563 unsigned short int tst = 0x1234;
1564 unsigned char *ptr = (unsigned char *) &tst;
1565
1566 if (handlers != NULL) return;
1567
1568 handlers = (xmlCharEncodingHandlerPtr *)
1569 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1570
1571 if (*ptr == 0x12) xmlLittleEndian = 0;
1572 else if (*ptr == 0x34) xmlLittleEndian = 1;
1573 else xmlGenericError(xmlGenericErrorContext,
1574 "Odd problem at endianness detection\n");
1575
1576 if (handlers == NULL) {
1577 xmlGenericError(xmlGenericErrorContext,
1578 "xmlInitCharEncodingHandlers : out of memory !\n");
1579 return;
1580 }
1581 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1582 xmlUTF16LEHandler =
1583 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1584 xmlUTF16BEHandler =
1585 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1586 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1587 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard20042422001-05-31 18:22:04 +00001588 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor3473f882001-02-23 17:55:21 +00001589#ifdef LIBXML_HTML_ENABLED
1590 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1591#endif
1592}
1593
1594/**
1595 * xmlCleanupCharEncodingHandlers:
1596 *
1597 * Cleanup the memory allocated for the char encoding support, it
1598 * unregisters all the encoding handlers and the aliases.
1599 */
1600void
1601xmlCleanupCharEncodingHandlers(void) {
1602 xmlCleanupEncodingAliases();
1603
1604 if (handlers == NULL) return;
1605
1606 for (;nbCharEncodingHandler > 0;) {
1607 nbCharEncodingHandler--;
1608 if (handlers[nbCharEncodingHandler] != NULL) {
1609 if (handlers[nbCharEncodingHandler]->name != NULL)
1610 xmlFree(handlers[nbCharEncodingHandler]->name);
1611 xmlFree(handlers[nbCharEncodingHandler]);
1612 }
1613 }
1614 xmlFree(handlers);
1615 handlers = NULL;
1616 nbCharEncodingHandler = 0;
1617 xmlDefaultCharEncodingHandler = NULL;
1618}
1619
1620/**
1621 * xmlRegisterCharEncodingHandler:
1622 * @handler: the xmlCharEncodingHandlerPtr handler block
1623 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001624 * Register the char encoding handler, surprising, isn't it ?
Owen Taylor3473f882001-02-23 17:55:21 +00001625 */
1626void
1627xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1628 if (handlers == NULL) xmlInitCharEncodingHandlers();
1629 if (handler == NULL) {
1630 xmlGenericError(xmlGenericErrorContext,
1631 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1632 return;
1633 }
1634
1635 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1636 xmlGenericError(xmlGenericErrorContext,
1637 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1638 xmlGenericError(xmlGenericErrorContext,
1639 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1640 return;
1641 }
1642 handlers[nbCharEncodingHandler++] = handler;
1643}
1644
1645/**
1646 * xmlGetCharEncodingHandler:
1647 * @enc: an xmlCharEncoding value.
1648 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001649 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001650 *
1651 * Returns the handler or NULL if not found
1652 */
1653xmlCharEncodingHandlerPtr
1654xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1655 xmlCharEncodingHandlerPtr handler;
1656
1657 if (handlers == NULL) xmlInitCharEncodingHandlers();
1658 switch (enc) {
1659 case XML_CHAR_ENCODING_ERROR:
1660 return(NULL);
1661 case XML_CHAR_ENCODING_NONE:
1662 return(NULL);
1663 case XML_CHAR_ENCODING_UTF8:
1664 return(NULL);
1665 case XML_CHAR_ENCODING_UTF16LE:
1666 return(xmlUTF16LEHandler);
1667 case XML_CHAR_ENCODING_UTF16BE:
1668 return(xmlUTF16BEHandler);
1669 case XML_CHAR_ENCODING_EBCDIC:
1670 handler = xmlFindCharEncodingHandler("EBCDIC");
1671 if (handler != NULL) return(handler);
1672 handler = xmlFindCharEncodingHandler("ebcdic");
1673 if (handler != NULL) return(handler);
1674 break;
1675 case XML_CHAR_ENCODING_UCS4BE:
1676 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1677 if (handler != NULL) return(handler);
1678 handler = xmlFindCharEncodingHandler("UCS-4");
1679 if (handler != NULL) return(handler);
1680 handler = xmlFindCharEncodingHandler("UCS4");
1681 if (handler != NULL) return(handler);
1682 break;
1683 case XML_CHAR_ENCODING_UCS4LE:
1684 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1685 if (handler != NULL) return(handler);
1686 handler = xmlFindCharEncodingHandler("UCS-4");
1687 if (handler != NULL) return(handler);
1688 handler = xmlFindCharEncodingHandler("UCS4");
1689 if (handler != NULL) return(handler);
1690 break;
1691 case XML_CHAR_ENCODING_UCS4_2143:
1692 break;
1693 case XML_CHAR_ENCODING_UCS4_3412:
1694 break;
1695 case XML_CHAR_ENCODING_UCS2:
1696 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1697 if (handler != NULL) return(handler);
1698 handler = xmlFindCharEncodingHandler("UCS-2");
1699 if (handler != NULL) return(handler);
1700 handler = xmlFindCharEncodingHandler("UCS2");
1701 if (handler != NULL) return(handler);
1702 break;
1703
1704 /*
1705 * We used to keep ISO Latin encodings native in the
1706 * generated data. This led to so many problems that
1707 * this has been removed. One can still change this
1708 * back by registering no-ops encoders for those
1709 */
1710 case XML_CHAR_ENCODING_8859_1:
1711 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1712 if (handler != NULL) return(handler);
1713 break;
1714 case XML_CHAR_ENCODING_8859_2:
1715 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1716 if (handler != NULL) return(handler);
1717 break;
1718 case XML_CHAR_ENCODING_8859_3:
1719 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1720 if (handler != NULL) return(handler);
1721 break;
1722 case XML_CHAR_ENCODING_8859_4:
1723 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1724 if (handler != NULL) return(handler);
1725 break;
1726 case XML_CHAR_ENCODING_8859_5:
1727 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1728 if (handler != NULL) return(handler);
1729 break;
1730 case XML_CHAR_ENCODING_8859_6:
1731 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1732 if (handler != NULL) return(handler);
1733 break;
1734 case XML_CHAR_ENCODING_8859_7:
1735 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1736 if (handler != NULL) return(handler);
1737 break;
1738 case XML_CHAR_ENCODING_8859_8:
1739 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1740 if (handler != NULL) return(handler);
1741 break;
1742 case XML_CHAR_ENCODING_8859_9:
1743 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1744 if (handler != NULL) return(handler);
1745 break;
1746
1747
1748 case XML_CHAR_ENCODING_2022_JP:
1749 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1750 if (handler != NULL) return(handler);
1751 break;
1752 case XML_CHAR_ENCODING_SHIFT_JIS:
1753 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1754 if (handler != NULL) return(handler);
1755 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1756 if (handler != NULL) return(handler);
1757 handler = xmlFindCharEncodingHandler("Shift_JIS");
1758 if (handler != NULL) return(handler);
1759 break;
1760 case XML_CHAR_ENCODING_EUC_JP:
1761 handler = xmlFindCharEncodingHandler("EUC-JP");
1762 if (handler != NULL) return(handler);
1763 break;
1764 default:
1765 break;
1766 }
1767
1768#ifdef DEBUG_ENCODING
1769 xmlGenericError(xmlGenericErrorContext,
1770 "No handler found for encoding %d\n", enc);
1771#endif
1772 return(NULL);
1773}
1774
1775/**
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001776 * xmlFindCharEncodingHandler:
1777 * @name: a string describing the char encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001778 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001779 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001780 *
1781 * Returns the handler or NULL if not found
1782 */
1783xmlCharEncodingHandlerPtr
1784xmlFindCharEncodingHandler(const char *name) {
1785 const char *nalias;
1786 const char *norig;
1787 xmlCharEncoding alias;
1788#ifdef LIBXML_ICONV_ENABLED
1789 xmlCharEncodingHandlerPtr enc;
1790 iconv_t icv_in, icv_out;
1791#endif /* LIBXML_ICONV_ENABLED */
1792 char upper[100];
1793 int i;
1794
1795 if (handlers == NULL) xmlInitCharEncodingHandlers();
1796 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1797 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1798
1799 /*
1800 * Do the alias resolution
1801 */
1802 norig = name;
1803 nalias = xmlGetEncodingAlias(name);
1804 if (nalias != NULL)
1805 name = nalias;
1806
1807 /*
1808 * Check first for directly registered encoding names
1809 */
1810 for (i = 0;i < 99;i++) {
1811 upper[i] = toupper(name[i]);
1812 if (upper[i] == 0) break;
1813 }
1814 upper[i] = 0;
1815
1816 for (i = 0;i < nbCharEncodingHandler; i++)
1817 if (!strcmp(upper, handlers[i]->name)) {
1818#ifdef DEBUG_ENCODING
1819 xmlGenericError(xmlGenericErrorContext,
1820 "Found registered handler for encoding %s\n", name);
1821#endif
1822 return(handlers[i]);
1823 }
1824
1825#ifdef LIBXML_ICONV_ENABLED
1826 /* check whether iconv can handle this */
1827 icv_in = iconv_open("UTF-8", name);
1828 icv_out = iconv_open(name, "UTF-8");
1829 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1830 enc = (xmlCharEncodingHandlerPtr)
1831 xmlMalloc(sizeof(xmlCharEncodingHandler));
1832 if (enc == NULL) {
1833 iconv_close(icv_in);
1834 iconv_close(icv_out);
1835 return(NULL);
1836 }
1837 enc->name = xmlMemStrdup(name);
1838 enc->input = NULL;
1839 enc->output = NULL;
1840 enc->iconv_in = icv_in;
1841 enc->iconv_out = icv_out;
1842#ifdef DEBUG_ENCODING
1843 xmlGenericError(xmlGenericErrorContext,
1844 "Found iconv handler for encoding %s\n", name);
1845#endif
1846 return enc;
1847 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1848 xmlGenericError(xmlGenericErrorContext,
1849 "iconv : problems with filters for '%s'\n", name);
1850 }
1851#endif /* LIBXML_ICONV_ENABLED */
1852
1853#ifdef DEBUG_ENCODING
1854 xmlGenericError(xmlGenericErrorContext,
1855 "No handler found for encoding %s\n", name);
1856#endif
1857
1858 /*
1859 * Fallback using the canonical names
1860 */
1861 alias = xmlParseCharEncoding(norig);
1862 if (alias != XML_CHAR_ENCODING_ERROR) {
1863 const char* canon;
1864 canon = xmlGetCharEncodingName(alias);
1865 if ((canon != NULL) && (strcmp(name, canon))) {
1866 return(xmlFindCharEncodingHandler(canon));
1867 }
1868 }
1869
1870 return(NULL);
1871}
1872
Daniel Veillard97ac1312001-05-30 19:14:17 +00001873/************************************************************************
1874 * *
1875 * ICONV based generic conversion functions *
1876 * *
1877 ************************************************************************/
1878
Owen Taylor3473f882001-02-23 17:55:21 +00001879#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes to convert
 * @inlen:  the length of @in
 *
 * Runs one iconv() call on the given buffers and maps the outcome to
 * the return codes used by the other converters of this file.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (E2BIG), or
 *     -2 if the transcoding fails because of an invalid input sequence
 *        (EILSEQ), or
 *     -3 if the input ends with an incomplete sequence (EINVAL) or for
 *        any other failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;     /* iconv() returns a size_t, (size_t) -1 on error */

    ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
    if (in != NULL) {
        /* iconv() left the remaining byte counts in icv_inlen/icv_outlen */
        *inlen -= icv_inlen;
        *outlen -= icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }
    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (errno == EILSEQ) {
            return -2;
        } else
#endif
#ifdef E2BIG
        if (errno == E2BIG) {
            return -1;
        } else
#endif
#ifdef EINVAL
        if (errno == EINVAL) {
            return -3;
        } else
#endif
        {
            return -3;
        }
    }
    return 0;
}
1938#endif /* LIBXML_ICONV_ENABLED */
1939
Daniel Veillard97ac1312001-05-30 19:14:17 +00001940/************************************************************************
1941 * *
1942 * The real API used by libxml for on-the-fly conversion *
1943 * *
1944 ************************************************************************/
1945
Owen Taylor3473f882001-02-23 17:55:21 +00001946/**
1947 * xmlCharEncFirstLine:
1948 * @handler: char enconding transformation data structure
1949 * @out: an xmlBuffer for the output.
1950 * @in: an xmlBuffer for the input
1951 *
1952 * Front-end for the encoding handler input function, but handle only
1953 * the very first line, i.e. limit itself to 45 chars.
1954 *
1955 * Returns the number of byte written if success, or
1956 * -1 general error
1957 * -2 if the transcoding fails (for *in is not valid utf8 string or
1958 * the result of transformation can't fit into the encoding we want), or
1959 */
1960int
1961xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1962 xmlBufferPtr in) {
1963 int ret = -2;
1964 int written;
1965 int toconv;
1966
1967 if (handler == NULL) return(-1);
1968 if (out == NULL) return(-1);
1969 if (in == NULL) return(-1);
1970
1971 written = out->size - out->use;
1972 toconv = in->use;
1973 if (toconv * 2 >= written) {
1974 xmlBufferGrow(out, toconv);
1975 written = out->size - out->use - 1;
1976 }
1977
1978 /*
1979 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1980 * 45 chars should be sufficient to reach the end of the encoding
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001981 * declaration without going too far inside the document content.
Owen Taylor3473f882001-02-23 17:55:21 +00001982 */
1983 written = 45;
1984
1985 if (handler->input != NULL) {
1986 ret = handler->input(&out->content[out->use], &written,
1987 in->content, &toconv);
1988 xmlBufferShrink(in, toconv);
1989 out->use += written;
1990 out->content[out->use] = 0;
1991 }
1992#ifdef LIBXML_ICONV_ENABLED
1993 else if (handler->iconv_in != NULL) {
1994 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1995 &written, in->content, &toconv);
1996 xmlBufferShrink(in, toconv);
1997 out->use += written;
1998 out->content[out->use] = 0;
1999 if (ret == -1) ret = -3;
2000 }
2001#endif /* LIBXML_ICONV_ENABLED */
2002#ifdef DEBUG_ENCODING
2003 switch (ret) {
2004 case 0:
2005 xmlGenericError(xmlGenericErrorContext,
2006 "converted %d bytes to %d bytes of input\n",
2007 toconv, written);
2008 break;
2009 case -1:
2010 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2011 toconv, written, in->use);
2012 break;
2013 case -2:
2014 xmlGenericError(xmlGenericErrorContext,
2015 "input conversion failed due to input error\n");
2016 break;
2017 case -3:
2018 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2019 toconv, written, in->use);
2020 break;
2021 default:
2022 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2023 }
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002024#endif /* DEBUG_ENCODING */
Owen Taylor3473f882001-02-23 17:55:21 +00002025 /*
2026 * Ignore when input buffer is not on a boundary
2027 */
2028 if (ret == -3) ret = 0;
2029 if (ret == -1) ret = 0;
2030 return(ret);
2031}
2032
2033/**
2034 * xmlCharEncInFunc:
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002035 * @handler: char encoding transformation data structure
Owen Taylor3473f882001-02-23 17:55:21 +00002036 * @out: an xmlBuffer for the output.
2037 * @in: an xmlBuffer for the input
2038 *
2039 * Generic front-end for the encoding handler input function
2040 *
2041 * Returns the number of byte written if success, or
2042 * -1 general error
2043 * -2 if the transcoding fails (for *in is not valid utf8 string or
2044 * the result of transformation can't fit into the encoding we want), or
2045 */
2046int
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002047xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2048 xmlBufferPtr in)
2049{
Owen Taylor3473f882001-02-23 17:55:21 +00002050 int ret = -2;
2051 int written;
2052 int toconv;
2053
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002054 if (handler == NULL)
2055 return (-1);
2056 if (out == NULL)
2057 return (-1);
2058 if (in == NULL)
2059 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00002060
2061 toconv = in->use;
2062 if (toconv == 0)
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002063 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00002064 written = out->size - out->use;
2065 if (toconv * 2 >= written) {
2066 xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002067 written = out->size - out->use - 1;
Owen Taylor3473f882001-02-23 17:55:21 +00002068 }
2069 if (handler->input != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002070 ret = handler->input(&out->content[out->use], &written,
2071 in->content, &toconv);
2072 xmlBufferShrink(in, toconv);
2073 out->use += written;
2074 out->content[out->use] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002075 }
2076#ifdef LIBXML_ICONV_ENABLED
2077 else if (handler->iconv_in != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002078 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2079 &written, in->content, &toconv);
2080 xmlBufferShrink(in, toconv);
2081 out->use += written;
2082 out->content[out->use] = 0;
2083 if (ret == -1)
2084 ret = -3;
Owen Taylor3473f882001-02-23 17:55:21 +00002085 }
2086#endif /* LIBXML_ICONV_ENABLED */
2087 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002088 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002089#ifdef DEBUG_ENCODING
2090 xmlGenericError(xmlGenericErrorContext,
2091 "converted %d bytes to %d bytes of input\n",
2092 toconv, written);
Owen Taylor3473f882001-02-23 17:55:21 +00002093#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002094 break;
2095 case -1:
2096#ifdef DEBUG_ENCODING
2097 xmlGenericError(xmlGenericErrorContext,
2098 "converted %d bytes to %d bytes of input, %d left\n",
2099 toconv, written, in->use);
2100#endif
2101 break;
2102 case -3:
2103#ifdef DEBUG_ENCODING
2104 xmlGenericError(xmlGenericErrorContext,
2105 "converted %d bytes to %d bytes of input, %d left\n",
2106 toconv, written, in->use);
2107#endif
2108 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002109 case -2:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002110 xmlGenericError(xmlGenericErrorContext,
2111 "input conversion failed due to input error\n");
2112 xmlGenericError(xmlGenericErrorContext,
2113 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2114 in->content[0], in->content[1],
2115 in->content[2], in->content[3]);
Owen Taylor3473f882001-02-23 17:55:21 +00002116 }
2117 /*
2118 * Ignore when input buffer is not on a boundary
2119 */
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002120 if (ret == -3)
2121 ret = 0;
2122 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00002123}
2124
2125/**
2126 * xmlCharEncOutFunc:
2127 * @handler: char enconding transformation data structure
2128 * @out: an xmlBuffer for the output.
2129 * @in: an xmlBuffer for the input
2130 *
2131 * Generic front-end for the encoding handler output function
2132 * a first call with @in == NULL has to be made firs to initiate the
2133 * output in case of non-stateless encoding needing to initiate their
2134 * state or the output (like the BOM in UTF16).
2135 * In case of UTF8 sequence conversion errors for the given encoder,
2136 * the content will be automatically remapped to a CharRef sequence.
2137 *
2138 * Returns the number of byte written if success, or
2139 * -1 general error
2140 * -2 if the transcoding fails (for *in is not valid utf8 string or
2141 * the result of transformation can't fit into the encoding we want), or
2142 */
2143int
2144xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2145 xmlBufferPtr in) {
2146 int ret = -2;
2147 int written;
2148 int writtentot = 0;
2149 int toconv;
2150 int output = 0;
2151
2152 if (handler == NULL) return(-1);
2153 if (out == NULL) return(-1);
2154
2155retry:
2156
2157 written = out->size - out->use;
2158
2159 /*
2160 * First specific handling of in = NULL, i.e. the initialization call
2161 */
2162 if (in == NULL) {
2163 toconv = 0;
2164 if (handler->output != NULL) {
2165 ret = handler->output(&out->content[out->use], &written,
2166 NULL, &toconv);
2167 out->use += written;
2168 out->content[out->use] = 0;
2169 }
2170#ifdef LIBXML_ICONV_ENABLED
2171 else if (handler->iconv_out != NULL) {
2172 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2173 &written, NULL, &toconv);
2174 out->use += written;
2175 out->content[out->use] = 0;
2176 }
2177#endif /* LIBXML_ICONV_ENABLED */
2178#ifdef DEBUG_ENCODING
2179 xmlGenericError(xmlGenericErrorContext,
2180 "initialized encoder\n");
2181#endif
2182 return(0);
2183 }
2184
2185 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002186 * Conversion itself.
Owen Taylor3473f882001-02-23 17:55:21 +00002187 */
2188 toconv = in->use;
2189 if (toconv == 0)
2190 return(0);
2191 if (toconv * 2 >= written) {
2192 xmlBufferGrow(out, toconv * 2);
2193 written = out->size - out->use - 1;
2194 }
2195 if (handler->output != NULL) {
2196 ret = handler->output(&out->content[out->use], &written,
2197 in->content, &toconv);
2198 xmlBufferShrink(in, toconv);
2199 out->use += written;
2200 writtentot += written;
2201 out->content[out->use] = 0;
2202 }
2203#ifdef LIBXML_ICONV_ENABLED
2204 else if (handler->iconv_out != NULL) {
2205 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2206 &written, in->content, &toconv);
2207 xmlBufferShrink(in, toconv);
2208 out->use += written;
2209 writtentot += written;
2210 out->content[out->use] = 0;
2211 if (ret == -1) {
2212 if (written > 0) {
2213 /*
2214 * Can be a limitation of iconv
2215 */
2216 goto retry;
2217 }
2218 ret = -3;
2219 }
2220 }
2221#endif /* LIBXML_ICONV_ENABLED */
2222 else {
2223 xmlGenericError(xmlGenericErrorContext,
2224 "xmlCharEncOutFunc: no output function !\n");
2225 return(-1);
2226 }
2227
2228 if (ret >= 0) output += ret;
2229
2230 /*
2231 * Attempt to handle error cases
2232 */
2233 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002234 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002235#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002236 xmlGenericError(xmlGenericErrorContext,
2237 "converted %d bytes to %d bytes of output\n",
2238 toconv, written);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002239#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002240 break;
2241 case -1:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002242#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002243 xmlGenericError(xmlGenericErrorContext,
2244 "output conversion failed by lack of space\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002245#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002246 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002247 case -3:
2248 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2249 toconv, written, in->use);
2250 break;
2251 case -2: {
2252 int len = in->use;
2253 const xmlChar *utf = (const xmlChar *) in->content;
2254 int cur;
2255
2256 cur = xmlGetUTF8Char(utf, &len);
2257 if (cur > 0) {
2258 xmlChar charref[20];
2259
2260#ifdef DEBUG_ENCODING
2261 xmlGenericError(xmlGenericErrorContext,
2262 "handling output conversion error\n");
2263 xmlGenericError(xmlGenericErrorContext,
2264 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2265 in->content[0], in->content[1],
2266 in->content[2], in->content[3]);
2267#endif
2268 /*
2269 * Removes the UTF8 sequence, and replace it by a charref
2270 * and continue the transcoding phase, hoping the error
2271 * did not mangle the encoder state.
2272 */
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002273 snprintf((char *) charref, sizeof(charref), "&#%d;", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002274 xmlBufferShrink(in, len);
2275 xmlBufferAddHead(in, charref, -1);
2276
2277 goto retry;
2278 } else {
2279 xmlGenericError(xmlGenericErrorContext,
2280 "output conversion failed due to conv error\n");
2281 xmlGenericError(xmlGenericErrorContext,
2282 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2283 in->content[0], in->content[1],
2284 in->content[2], in->content[3]);
2285 in->content[0] = ' ';
2286 }
2287 break;
2288 }
2289 }
2290 return(ret);
2291}
2292
2293/**
2294 * xmlCharEncCloseFunc:
2295 * @handler: char enconding transformation data structure
2296 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002297 * Generic front-end for encoding handler close function
Owen Taylor3473f882001-02-23 17:55:21 +00002298 *
2299 * Returns 0 if success, or -1 in case of error
2300 */
2301int
2302xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2303 int ret = 0;
2304 if (handler == NULL) return(-1);
2305 if (handler->name == NULL) return(-1);
2306#ifdef LIBXML_ICONV_ENABLED
2307 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002308 * Iconv handlers can be used only once, free the whole block.
Owen Taylor3473f882001-02-23 17:55:21 +00002309 * and the associated icon resources.
2310 */
2311 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2312 if (handler->name != NULL)
2313 xmlFree(handler->name);
2314 handler->name = NULL;
2315 if (handler->iconv_out != NULL) {
2316 if (iconv_close(handler->iconv_out))
2317 ret = -1;
2318 handler->iconv_out = NULL;
2319 }
2320 if (handler->iconv_in != NULL) {
2321 if (iconv_close(handler->iconv_in))
2322 ret = -1;
2323 handler->iconv_in = NULL;
2324 }
2325 xmlFree(handler);
2326 }
2327#endif /* LIBXML_ICONV_ENABLED */
2328#ifdef DEBUG_ENCODING
2329 if (ret)
2330 xmlGenericError(xmlGenericErrorContext,
2331 "failed to close the encoding handler\n");
2332 else
2333 xmlGenericError(xmlGenericErrorContext,
2334 "closed the encoding handler\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002335#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002336
Owen Taylor3473f882001-02-23 17:55:21 +00002337 return(ret);
2338}
2339