blob: 8e2397a2d4730ef8f6a421553bae35f92786b5b8 [file] [log] [blame]
Owen Taylor3473f882001-02-23 17:55:21 +00001/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1] ISO Latin-1 characters codes.
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10 * Worldwide Character Encoding -- Version 1.0", Addison-
11 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12 * described in Unicode Technical Report #4.
13 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14 * Information Interchange, ANSI X3.4-1986.
15 *
Owen Taylor3473f882001-02-23 17:55:21 +000016 * See Copyright for the status of this software.
17 *
Daniel Veillardc5d64342001-06-24 12:13:24 +000018 * daniel@veillard.com
Daniel Veillard97ac1312001-05-30 19:14:17 +000019 *
20 * UTF8 string routines from:
21 * "William M. Brack" <wbrack@mmm.com.hk>
22 *
23 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
Owen Taylor3473f882001-02-23 17:55:21 +000024 */
25
Daniel Veillard34ce8be2002-03-18 19:37:11 +000026#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000027#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000028
Owen Taylor3473f882001-02-23 17:55:21 +000029#include <string.h>
30
31#ifdef HAVE_CTYPE_H
32#include <ctype.h>
33#endif
34#ifdef HAVE_STDLIB_H
35#include <stdlib.h>
36#endif
Owen Taylor3473f882001-02-23 17:55:21 +000037#ifdef LIBXML_ICONV_ENABLED
38#ifdef HAVE_ERRNO_H
39#include <errno.h>
40#endif
41#endif
42#include <libxml/encoding.h>
43#include <libxml/xmlmemory.h>
44#ifdef LIBXML_HTML_ENABLED
45#include <libxml/HTMLparser.h>
46#endif
Daniel Veillard64a411c2001-10-15 12:32:07 +000047#include <libxml/globals.h>
Daniel Veillarda4617b82001-11-04 20:19:12 +000048#include <libxml/xmlerror.h>
Owen Taylor3473f882001-02-23 17:55:21 +000049
Daniel Veillard22090732001-07-16 00:06:07 +000050static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
51static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +000052
53typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
54typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
55struct _xmlCharEncodingAlias {
56 const char *name;
57 const char *alias;
58};
59
60static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
61static int xmlCharEncodingAliasesNb = 0;
62static int xmlCharEncodingAliasesMax = 0;
63
64#ifdef LIBXML_ICONV_ENABLED
65#if 0
66#define DEBUG_ENCODING /* Define this to get encoding traces */
67#endif
68#endif
69
70static int xmlLittleEndian = 1;
71
Daniel Veillard97ac1312001-05-30 19:14:17 +000072/************************************************************************
73 * *
74 * Generic UTF8 handling routines *
75 * *
76 * From rfc2044: encoding of the Unicode values on UTF-8: *
77 * *
78 * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
79 * 0000 0000-0000 007F 0xxxxxxx *
80 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
81 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
82 * *
83 * I hope we won't use values > 0xFFFF anytime soon ! *
84 * *
85 ************************************************************************/
Owen Taylor3473f882001-02-23 17:55:21 +000086
87/**
Daniel Veillarde043ee12001-04-16 14:08:07 +000088 * xmlUTF8Strlen:
89 * @utf: a sequence of UTF-8 encoded bytes
90 *
Daniel Veillard60087f32001-10-10 09:45:09 +000091 * compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillarde043ee12001-04-16 14:08:07 +000092 * checking of the content of the string.
93 *
94 * Returns the number of characters in the string or -1 in case of error
95 */
96int
Daniel Veillard97ac1312001-05-30 19:14:17 +000097xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillarde043ee12001-04-16 14:08:07 +000098 int ret = 0;
99
100 if (utf == NULL)
101 return(-1);
102
103 while (*utf != 0) {
104 if (utf[0] & 0x80) {
105 if ((utf[1] & 0xc0) != 0x80)
106 return(-1);
107 if ((utf[0] & 0xe0) == 0xe0) {
108 if ((utf[2] & 0xc0) != 0x80)
109 return(-1);
110 if ((utf[0] & 0xf0) == 0xf0) {
111 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
112 return(-1);
113 utf += 4;
114 } else {
115 utf += 3;
116 }
117 } else {
118 utf += 2;
119 }
120 } else {
121 utf++;
122 }
123 ret++;
124 }
125 return(ret);
126}
127
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  on input the number of bytes available at @utf, on output the
 *        number of bytes used by the character read
 *
 * Read one UTF-8 encoded character from @utf. Sequences of 1 to 4 bytes
 * are accepted; a continuation byte (10xxxxxx) found in lead position is
 * rejected. Overlong encodings are not detected.
 *
 * Returns the char value or -1 in case of error. On error *@len is set to
 * 0, unless @len itself is NULL in which case nothing is written.
 */
static int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    /* without @len there is nowhere to report a length: never touch it */
    if (len == NULL)
        return(-1);
    if ((utf == NULL) || (*len < 1))
        goto error;

    c = utf[0];
    if (c < 0x80) {
        /* 1-byte code */
        *len = 1;
        return((int) c);
    }

    /* a 10xxxxxx byte can only follow a lead byte, it cannot start a char */
    if ((c & 0xc0) != 0xc0)
        goto error;
    if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
        goto error;
    if ((c & 0xe0) != 0xe0) {
        /* 2-byte code */
        *len = 2;
        c = (utf[0] & 0x1f) << 6;
        c |= utf[1] & 0x3f;
        return((int) c);
    }

    if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
        goto error;
    if ((c & 0xf0) != 0xf0) {
        /* 3-byte code */
        *len = 3;
        c = (utf[0] & 0xf) << 12;
        c |= (utf[1] & 0x3f) << 6;
        c |= utf[2] & 0x3f;
        return((int) c);
    }

    /* 4-byte code: the lead must be exactly 11110xxx */
    if ((*len < 4) || ((c & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto error;
    *len = 4;
    c = (utf[0] & 0x7) << 18;
    c |= (utf[1] & 0x3f) << 12;
    c |= (utf[2] & 0x3f) << 6;
    c |= utf[3] & 0x3f;
    return((int) c);

error:
    *len = 0;
    return(-1);
}
194
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * A continuation byte (10xxxxxx) appearing where a lead byte is expected
 * makes the string invalid, and a NULL @utf is reported as invalid.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix = 0;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * Trailing bytes are tested left to right with short-circuit
     * evaluation: a NUL terminator fails the 10xxxxxx test, so the scan
     * never reads past the end of the string.
     */
    while ((c = utf[ix]) != 0) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code */
            ix += 1;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte or a lead byte beyond 4-byte codes */
            return(0);
        }
    }
    return(1);
}
238
239/**
Daniel Veillard97ac1312001-05-30 19:14:17 +0000240 * xmlUTF8Strsize:
241 * @utf: a sequence of UTF-8 encoded bytes
242 * @len: the number of characters in the array
243 *
244 * storage size of an UTF8 string
245 *
246 * Returns the storage size of
247 * the first 'len' characters of ARRAY
248 *
249 */
250
251int
252xmlUTF8Strsize(const xmlChar *utf, int len) {
253 const xmlChar *ptr=utf;
254 xmlChar ch;
255
256 if (len <= 0)
257 return(0);
258
259 while ( len-- > 0) {
260 if ( !*ptr )
261 break;
262 if ( (ch = *ptr++) & 0x80)
263 while ( (ch<<=1) & 0x80 )
264 ptr++;
265 }
266 return (ptr - utf);
267}
268
269
270/**
271 * xmlUTF8Strndup:
272 * @utf: the input UTF8 *
273 * @len: the len of @utf (in chars)
274 *
275 * a strndup for array of UTF8's
276 *
277 * Returns a new UTF8 * or NULL
278 */
279xmlChar *
280xmlUTF8Strndup(const xmlChar *utf, int len) {
281 xmlChar *ret;
282 int i;
283
284 if ((utf == NULL) || (len < 0)) return(NULL);
285 i = xmlUTF8Strsize(utf, len);
286 ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
287 if (ret == NULL) {
288 xmlGenericError(xmlGenericErrorContext,
289 "malloc of %ld byte failed\n",
290 (len + 1) * (long)sizeof(xmlChar));
291 return(NULL);
292 }
293 memcpy(ret, utf, i * sizeof(xmlChar));
294 ret[i] = 0;
295 return(ret);
296}
297
298/**
299 * xmlUTF8Strpos:
300 * @utf: the input UTF8 *
301 * @pos: the position of the desired UTF8 char (in chars)
302 *
303 * a function to provide the equivalent of fetching a
304 * character from a string array
305 *
306 * Returns a pointer to the UTF8 character or NULL
307 */
308xmlChar *
309xmlUTF8Strpos(const xmlChar *utf, int pos) {
310 xmlChar ch;
311
312 if (utf == NULL) return(NULL);
313 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
314 return(NULL);
315 while (pos--) {
316 if ((ch=*utf++) == 0) return(NULL);
317 if ( ch & 0x80 ) {
318 /* if not simple ascii, verify proper format */
319 if ( (ch & 0xc0) != 0xc0 )
320 return(NULL);
321 /* then skip over remaining bytes for this char */
322 while ( (ch <<= 1) & 0x80 )
323 if ( (*utf++ & 0xc0) != 0x80 )
324 return(NULL);
325 }
326 }
327 return((xmlChar *)utf);
328}
329
330/**
331 * xmlUTF8Strloc:
332 * @utf: the input UTF8 *
333 * @utfchar: the UTF8 character to be found
334 *
335 * a function to provide relative location of a UTF8 char
336 *
337 * Returns the relative character position of the desired char
338 * or -1 if not found
339 */
340int
341xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
342 int i, size;
343 xmlChar ch;
344
345 if (utf==NULL || utfchar==NULL) return -1;
346 size = xmlUTF8Strsize(utfchar, 1);
347 for(i=0; (ch=*utf) != 0; i++) {
348 if (xmlStrncmp(utf, utfchar, size)==0)
349 return(i);
350 utf++;
351 if ( ch & 0x80 ) {
352 /* if not simple ascii, verify proper format */
353 if ( (ch & 0xc0) != 0xc0 )
354 return(-1);
355 /* then skip over remaining bytes for this char */
356 while ( (ch <<= 1) & 0x80 )
357 if ( (*utf++ & 0xc0) != 0x80 )
358 return(-1);
359 }
360 }
361
362 return(-1);
363}
364/**
365 * xmlUTF8Strsub:
366 * @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard97ac1312001-05-30 19:14:17 +0000367 * @start: relative pos of first char
368 * @len: total number to copy
369 *
370 * Note: positions are given in units of UTF-8 chars
371 *
372 * Returns a pointer to a newly created string
373 * or NULL if any problem
374 */
375
376xmlChar *
377xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
378 int i;
379 xmlChar ch;
380
381 if (utf == NULL) return(NULL);
382 if (start < 0) return(NULL);
383 if (len < 0) return(NULL);
384
385 /*
386 * Skip over any leading chars
387 */
388 for (i = 0;i < start;i++) {
389 if ((ch=*utf++) == 0) return(NULL);
390 if ( ch & 0x80 ) {
391 /* if not simple ascii, verify proper format */
392 if ( (ch & 0xc0) != 0xc0 )
393 return(NULL);
394 /* then skip over remaining bytes for this char */
395 while ( (ch <<= 1) & 0x80 )
396 if ( (*utf++ & 0xc0) != 0x80 )
397 return(NULL);
398 }
399 }
400
401 return(xmlUTF8Strndup(utf, len));
402}
403
404/************************************************************************
405 * *
406 * Conversions To/From UTF8 encoding *
407 * *
408 ************************************************************************/
409
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. ASCII bytes are copied unchanged; conversion stops
 * as soon as fewer than six bytes of room remain in @out.
 *
 * Returns 0 if success, or -1 if a byte >= 0x80 is found in @in.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const int capacity = *outlen;
    const int available = *inlen;
    int written = 0;
    int consumed = 0;
    int status = 0;

    /* keep the historical 5 byte safety margin on the output side */
    while ((consumed < available) && (written + 5 < capacity)) {
        unsigned char ch = in[consumed];

        if (ch >= 0x80) {
            /* not ASCII: report what was converted so far */
            status = -1;
            break;
        }
        out[written++] = ch;
        consumed++;
    }

    *outlen = written;
    *inlen = consumed;
    return(status);
}
460
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. A NULL @in is an initialization call and produces
 * nothing. A multi-byte sequence cut by the end of @in is left
 * unconsumed so that the caller can supply the rest later.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a character that does not exist in ASCII).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failed;    /* trailing byte in lead position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failed;    /* no chance for this in Ascii */

        /* sequence split by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a non-continuation byte here means malformed input */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80)
            goto failed;                    /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

failed:
    /* report only what was fully converted before the offending char */
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
543
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied, the others expand to
 * a two byte sequence; conversion stops when the next character no
 * longer fits in @out.
 *
 * Returns 0 in all cases.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const int room = *outlen;
    const int count = *inlen;
    int written = 0;
    int consumed;

    for (consumed = 0; consumed < count; consumed++) {
        unsigned int ch = in[consumed];

        if (ch < 0x80) {
            if (written >= room)
                break;
            out[written++] = ch;
        } else {
            /*
             * two output bytes are needed for this one
             */
            if (written + 1 >= room)
                break;
            out[written++] = 0xC0 | ((ch >> 6) & 0x1F);
            out[written++] = 0x80 | (ch & 0x3F);
        }
    }

    *outlen = written;
    *inlen = consumed;
    return(0);
}
595
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. A NULL @in is an initialization call and produces
 * nothing. A sequence cut by the end of @in, or a character for which
 * @out has no room left, is left unconsumed.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a character above 0xFF).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *cur;
    const unsigned char *done;
    const unsigned char *stop;
    unsigned char *dst;
    const unsigned char *dstend;
    unsigned int value, byte;
    int extra;
    int status = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    cur = in;
    done = in;
    stop = in + (*inlen);
    dst = out;
    dstend = out + (*outlen);

    while (cur < stop) {
        byte = *cur++;
        if (byte < 0x80) {
            value = byte;
            extra = 0;
        } else if ((byte < 0xC0) || (byte >= 0xF8)) {
            /* trailing byte in lead position, or lead beyond 4-byte codes */
            status = -2;
            goto finish;
        } else if (byte < 0xE0) {
            value = byte & 0x1F;
            extra = 1;
        } else if (byte < 0xF0) {
            value = byte & 0x0F;
            extra = 2;
        } else {
            value = byte & 0x07;
            extra = 3;
        }

        /* sequence split by the end of the buffer */
        if (stop - cur < extra)
            goto finish;

        while (extra > 0) {
            byte = *cur++;
            if ((byte & 0xC0) != 0x80) {
                status = -2;
                goto finish;
            }
            value = (value << 6) | (byte & 0x3F);
            extra--;
        }

        /* assertion: value is a single UCS-4 value */
        if (value > 0xFF) {
            /* no chance for this in IsoLat1 */
            status = -2;
            goto finish;
        }
        if (dst >= dstend)
            goto finish;
        *dst++ = value;
        done = cur;
    }

finish:
    *outlen = dst - out;
    *inlen = done - in;
    return(status);
}
683
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is decoded one byte at a time, so
 * @inb needs no particular alignment and the result does not depend on
 * the byte order of the host. An odd trailing byte, or a high surrogate
 * whose partner has not arrived yet, is left unconsumed. Conversion
 * stops as soon as fewer than six bytes of room remain in @out.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate).
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    /* the 5 byte margin guarantees room for a complete 4 byte sequence */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* pair split across buffers: wait for the second half */
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) {  *out++=  c;                bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
771
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is produced one byte at a time, so
 * @outb needs no particular alignment and the result does not depend on
 * the byte order of the host. A sequence cut by the end of @in, or a
 * character for which @outb has no room left, is left unconsumed.
 *
 * A NULL @in is an initialization call: the Byte Order Mark FF FE is
 * written when @outb has room for it.
 *
 * Returns 0 on success (2 when the Byte Order Mark was written by an
 * initialization call), or -2 if the transcoding failed (malformed UTF-8
 * or a value that UTF-16 cannot represent).
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    const unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failed;    /* trailing byte in lead position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failed;    /* no chance for this in UTF-16 */

        /* sequence split by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a non-continuation byte here means malformed input */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c & 0xFF);
            out[1] = (unsigned char) (c >> 8);
            out += 2;
        } else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi & 0xFF);
            out[1] = (unsigned char) (hi >> 8);
            out[2] = (unsigned char) (lo & 0xFF);
            out[3] = (unsigned char) (lo >> 8);
            out += 4;
        } else {
            /* beyond U+10FFFF: cannot be expressed in UTF-16 */
            goto failed;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

failed:
    /* report only what was fully converted before the offending char */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
890
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is decoded one byte at a time, so
 * @inb needs no particular alignment and the result does not depend on
 * the byte order of the host. An odd trailing byte, a high surrogate
 * whose partner has not arrived yet, or a character whose complete UTF-8
 * form does not fit in the room left in @out, is left unconsumed.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate).
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* outend = out + *outlen;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;
    int needed;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* pair split across buffers: wait for the second half */
                break;
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80)  needed = 1;
        else if (c <   0x800)  needed = 2;
        else if (c < 0x10000)  needed = 3;
        else                   needed = 4;

        /* never emit a partial sequence: all of it fits or none is written */
        if (outend - out < needed)
            break;

        switch (needed) {
            case 1:
                *out++ = (unsigned char) c;
                break;
            case 2:
                *out++ = ((c >>  6) & 0x1F) | 0xC0;
                break;
            case 3:
                *out++ = ((c >> 12) & 0x0F) | 0xE0;
                break;
            default:
                *out++ = ((c >> 18) & 0x07) | 0xF0;
                break;
        }
        /* continuation bytes, 6 bits each, most significant first */
        for (bits = (needed - 2) * 6; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
982
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is produced one byte at a time, so
 * @outb needs no particular alignment and the result does not depend on
 * the byte order of the host. A sequence cut by the end of @in, or a
 * character for which @outb has no room left, is left unconsumed.
 *
 * A NULL @in is an initialization call: the Byte Order Mark FE FF is
 * written when @outb has room for it.
 *
 * Returns 0 on success (2 when the Byte Order Mark was written by an
 * initialization call), or -2 if the transcoding failed (malformed UTF-8
 * or a value that UTF-16 cannot represent).
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    const unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failed;    /* trailing byte in lead position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failed;    /* no chance for this in UTF-16 */

        /* sequence split by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a non-continuation byte here means malformed input */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c >> 8);
            out[1] = (unsigned char) (c & 0xFF);
            out += 2;
        } else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi >> 8);
            out[1] = (unsigned char) (hi & 0xFF);
            out[2] = (unsigned char) (lo >> 8);
            out[3] = (unsigned char) (lo & 0xFF);
            out += 4;
        } else {
            /* beyond U+10FFFF: cannot be expressed in UTF-16 */
            goto failed;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

failed:
    /* report, in bytes, only what was converted before the offending char */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
1098
Daniel Veillard97ac1312001-05-30 19:14:17 +00001099/************************************************************************
1100 * *
1101 * Generic encoding handling routines *
1102 * *
1103 ************************************************************************/
1104
Owen Taylor3473f882001-02-23 17:55:21 +00001105/**
1106 * xmlDetectCharEncoding:
1107 * @in: a pointer to the first bytes of the XML entity, must be at least
1108 * 4 bytes long.
1109 * @len: pointer to the length of the buffer
1110 *
1111 * Guess the encoding of the entity using the first bytes of the entity content
1112 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1113 *
1114 * Returns one of the XML_CHAR_ENCODING_... values.
1115 */
1116xmlCharEncoding
1117xmlDetectCharEncoding(const unsigned char* in, int len)
1118{
1119 if (len >= 4) {
1120 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1121 (in[2] == 0x00) && (in[3] == 0x3C))
1122 return(XML_CHAR_ENCODING_UCS4BE);
1123 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1124 (in[2] == 0x00) && (in[3] == 0x00))
1125 return(XML_CHAR_ENCODING_UCS4LE);
1126 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1127 (in[2] == 0x3C) && (in[3] == 0x00))
1128 return(XML_CHAR_ENCODING_UCS4_2143);
1129 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1130 (in[2] == 0x00) && (in[3] == 0x00))
1131 return(XML_CHAR_ENCODING_UCS4_3412);
1132 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1133 (in[2] == 0xA7) && (in[3] == 0x94))
1134 return(XML_CHAR_ENCODING_EBCDIC);
1135 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1136 (in[2] == 0x78) && (in[3] == 0x6D))
1137 return(XML_CHAR_ENCODING_UTF8);
1138 }
Daniel Veillard87a764e2001-06-20 17:41:10 +00001139 if (len >= 3) {
1140 /*
1141 * Errata on XML-1.0 June 20 2001
1142 * We now allow an UTF8 encoded BOM
1143 */
1144 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1145 (in[2] == 0xBF))
1146 return(XML_CHAR_ENCODING_UTF8);
1147 }
Owen Taylor3473f882001-02-23 17:55:21 +00001148 if (len >= 2) {
1149 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1150 return(XML_CHAR_ENCODING_UTF16BE);
1151 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1152 return(XML_CHAR_ENCODING_UTF16LE);
1153 }
1154 return(XML_CHAR_ENCODING_NONE);
1155}
1156
1157/**
1158 * xmlCleanupEncodingAliases:
1159 *
1160 * Unregisters all aliases
1161 */
1162void
1163xmlCleanupEncodingAliases(void) {
1164 int i;
1165
1166 if (xmlCharEncodingAliases == NULL)
1167 return;
1168
1169 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1170 if (xmlCharEncodingAliases[i].name != NULL)
1171 xmlFree((char *) xmlCharEncodingAliases[i].name);
1172 if (xmlCharEncodingAliases[i].alias != NULL)
1173 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1174 }
1175 xmlCharEncodingAliasesNb = 0;
1176 xmlCharEncodingAliasesMax = 0;
1177 xmlFree(xmlCharEncodingAliases);
Daniel Veillard73c6e532002-01-08 13:15:33 +00001178 xmlCharEncodingAliases = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001179}
1180
1181/**
1182 * xmlGetEncodingAlias:
1183 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1184 *
1185 * Lookup an encoding name for the given alias.
1186 *
1187 * Returns NULL if not found the original name otherwise
1188 */
1189const char *
1190xmlGetEncodingAlias(const char *alias) {
1191 int i;
1192 char upper[100];
1193
1194 if (alias == NULL)
1195 return(NULL);
1196
1197 if (xmlCharEncodingAliases == NULL)
1198 return(NULL);
1199
1200 for (i = 0;i < 99;i++) {
1201 upper[i] = toupper(alias[i]);
1202 if (upper[i] == 0) break;
1203 }
1204 upper[i] = 0;
1205
1206 /*
1207 * Walk down the list looking for a definition of the alias
1208 */
1209 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1210 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1211 return(xmlCharEncodingAliases[i].name);
1212 }
1213 }
1214 return(NULL);
1215}
1216
1217/**
1218 * xmlAddEncodingAlias:
1219 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1220 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1221 *
1222 * Registers and alias @alias for an encoding named @name. Existing alias
1223 * will be overwritten.
1224 *
1225 * Returns 0 in case of success, -1 in case of error
1226 */
1227int
1228xmlAddEncodingAlias(const char *name, const char *alias) {
1229 int i;
1230 char upper[100];
1231
1232 if ((name == NULL) || (alias == NULL))
1233 return(-1);
1234
1235 for (i = 0;i < 99;i++) {
1236 upper[i] = toupper(alias[i]);
1237 if (upper[i] == 0) break;
1238 }
1239 upper[i] = 0;
1240
1241 if (xmlCharEncodingAliases == NULL) {
1242 xmlCharEncodingAliasesNb = 0;
1243 xmlCharEncodingAliasesMax = 20;
1244 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1245 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1246 if (xmlCharEncodingAliases == NULL)
1247 return(-1);
1248 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1249 xmlCharEncodingAliasesMax *= 2;
1250 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1251 xmlRealloc(xmlCharEncodingAliases,
1252 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1253 }
1254 /*
1255 * Walk down the list looking for a definition of the alias
1256 */
1257 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1258 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1259 /*
1260 * Replace the definition.
1261 */
1262 xmlFree((char *) xmlCharEncodingAliases[i].name);
1263 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1264 return(0);
1265 }
1266 }
1267 /*
1268 * Add the definition
1269 */
1270 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1271 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1272 xmlCharEncodingAliasesNb++;
1273 return(0);
1274}
1275
1276/**
1277 * xmlDelEncodingAlias:
1278 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1279 *
1280 * Unregisters an encoding alias @alias
1281 *
1282 * Returns 0 in case of success, -1 in case of error
1283 */
1284int
1285xmlDelEncodingAlias(const char *alias) {
1286 int i;
1287
1288 if (alias == NULL)
1289 return(-1);
1290
1291 if (xmlCharEncodingAliases == NULL)
1292 return(-1);
1293 /*
1294 * Walk down the list looking for a definition of the alias
1295 */
1296 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1297 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1298 xmlFree((char *) xmlCharEncodingAliases[i].name);
1299 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1300 xmlCharEncodingAliasesNb--;
1301 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1302 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1303 return(0);
1304 }
1305 }
1306 return(-1);
1307}
1308
1309/**
1310 * xmlParseCharEncoding:
1311 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1312 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001313 * Compare the string to the known encoding schemes already known. Note
Owen Taylor3473f882001-02-23 17:55:21 +00001314 * that the comparison is case insensitive accordingly to the section
1315 * [XML] 4.3.3 Character Encoding in Entities.
1316 *
1317 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1318 * if not recognized.
1319 */
1320xmlCharEncoding
1321xmlParseCharEncoding(const char* name)
1322{
1323 const char *alias;
1324 char upper[500];
1325 int i;
1326
1327 if (name == NULL)
1328 return(XML_CHAR_ENCODING_NONE);
1329
1330 /*
1331 * Do the alias resolution
1332 */
1333 alias = xmlGetEncodingAlias(name);
1334 if (alias != NULL)
1335 name = alias;
1336
1337 for (i = 0;i < 499;i++) {
1338 upper[i] = toupper(name[i]);
1339 if (upper[i] == 0) break;
1340 }
1341 upper[i] = 0;
1342
1343 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1344 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1345 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1346
1347 /*
1348 * NOTE: if we were able to parse this, the endianness of UTF16 is
1349 * already found and in use
1350 */
1351 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1352 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1353
1354 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1355 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1356 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1357
1358 /*
1359 * NOTE: if we were able to parse this, the endianness of UCS4 is
1360 * already found and in use
1361 */
1362 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1363 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1364 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1365
1366
1367 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1368 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1369 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1370
1371 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1372 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1373 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1374
1375 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1376 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1377 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1378 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1379 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1380 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1381 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1382
1383 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1384 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1385 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1386
1387#ifdef DEBUG_ENCODING
1388 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1389#endif
1390 return(XML_CHAR_ENCODING_ERROR);
1391}
1392
1393/**
1394 * xmlGetCharEncodingName:
1395 * @enc: the encoding
1396 *
1397 * The "canonical" name for XML encoding.
1398 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1399 * Section 4.3.3 Character Encoding in Entities
1400 *
1401 * Returns the canonical name for the given encoding
1402 */
1403
1404const char*
1405xmlGetCharEncodingName(xmlCharEncoding enc) {
1406 switch (enc) {
1407 case XML_CHAR_ENCODING_ERROR:
1408 return(NULL);
1409 case XML_CHAR_ENCODING_NONE:
1410 return(NULL);
1411 case XML_CHAR_ENCODING_UTF8:
1412 return("UTF-8");
1413 case XML_CHAR_ENCODING_UTF16LE:
1414 return("UTF-16");
1415 case XML_CHAR_ENCODING_UTF16BE:
1416 return("UTF-16");
1417 case XML_CHAR_ENCODING_EBCDIC:
1418 return("EBCDIC");
1419 case XML_CHAR_ENCODING_UCS4LE:
1420 return("ISO-10646-UCS-4");
1421 case XML_CHAR_ENCODING_UCS4BE:
1422 return("ISO-10646-UCS-4");
1423 case XML_CHAR_ENCODING_UCS4_2143:
1424 return("ISO-10646-UCS-4");
1425 case XML_CHAR_ENCODING_UCS4_3412:
1426 return("ISO-10646-UCS-4");
1427 case XML_CHAR_ENCODING_UCS2:
1428 return("ISO-10646-UCS-2");
1429 case XML_CHAR_ENCODING_8859_1:
1430 return("ISO-8859-1");
1431 case XML_CHAR_ENCODING_8859_2:
1432 return("ISO-8859-2");
1433 case XML_CHAR_ENCODING_8859_3:
1434 return("ISO-8859-3");
1435 case XML_CHAR_ENCODING_8859_4:
1436 return("ISO-8859-4");
1437 case XML_CHAR_ENCODING_8859_5:
1438 return("ISO-8859-5");
1439 case XML_CHAR_ENCODING_8859_6:
1440 return("ISO-8859-6");
1441 case XML_CHAR_ENCODING_8859_7:
1442 return("ISO-8859-7");
1443 case XML_CHAR_ENCODING_8859_8:
1444 return("ISO-8859-8");
1445 case XML_CHAR_ENCODING_8859_9:
1446 return("ISO-8859-9");
1447 case XML_CHAR_ENCODING_2022_JP:
1448 return("ISO-2022-JP");
1449 case XML_CHAR_ENCODING_SHIFT_JIS:
1450 return("Shift-JIS");
1451 case XML_CHAR_ENCODING_EUC_JP:
1452 return("EUC-JP");
1453 case XML_CHAR_ENCODING_ASCII:
1454 return(NULL);
1455 }
1456 return(NULL);
1457}
1458
Daniel Veillard97ac1312001-05-30 19:14:17 +00001459/************************************************************************
1460 * *
1461 * Char encoding handlers *
1462 * *
1463 ************************************************************************/
1464
Owen Taylor3473f882001-02-23 17:55:21 +00001465
1466/* the size should be growable, but it's not a big deal ... */
1467#define MAX_ENCODING_HANDLERS 50
1468static xmlCharEncodingHandlerPtr *handlers = NULL;
1469static int nbCharEncodingHandler = 0;
1470
1471/*
1472 * The default is UTF-8 for XML, that's also the default used for the
1473 * parser internals, so the default encoding handler is NULL
1474 */
1475
1476static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1477
1478/**
1479 * xmlNewCharEncodingHandler:
1480 * @name: the encoding name, in UTF-8 format (ASCII actually)
1481 * @input: the xmlCharEncodingInputFunc to read that encoding
1482 * @output: the xmlCharEncodingOutputFunc to write that encoding
1483 *
1484 * Create and registers an xmlCharEncodingHandler.
1485 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1486 */
Daniel Veillard56a4cb82001-03-24 17:00:36 +00001487static xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001488xmlNewCharEncodingHandler(const char *name,
1489 xmlCharEncodingInputFunc input,
1490 xmlCharEncodingOutputFunc output) {
1491 xmlCharEncodingHandlerPtr handler;
1492 const char *alias;
1493 char upper[500];
1494 int i;
1495 char *up = 0;
1496
1497 /*
1498 * Do the alias resolution
1499 */
1500 alias = xmlGetEncodingAlias(name);
1501 if (alias != NULL)
1502 name = alias;
1503
1504 /*
1505 * Keep only the uppercase version of the encoding.
1506 */
1507 if (name == NULL) {
1508 xmlGenericError(xmlGenericErrorContext,
1509 "xmlNewCharEncodingHandler : no name !\n");
1510 return(NULL);
1511 }
1512 for (i = 0;i < 499;i++) {
1513 upper[i] = toupper(name[i]);
1514 if (upper[i] == 0) break;
1515 }
1516 upper[i] = 0;
1517 up = xmlMemStrdup(upper);
1518 if (up == NULL) {
1519 xmlGenericError(xmlGenericErrorContext,
1520 "xmlNewCharEncodingHandler : out of memory !\n");
1521 return(NULL);
1522 }
1523
1524 /*
1525 * allocate and fill-up an handler block.
1526 */
1527 handler = (xmlCharEncodingHandlerPtr)
1528 xmlMalloc(sizeof(xmlCharEncodingHandler));
1529 if (handler == NULL) {
1530 xmlGenericError(xmlGenericErrorContext,
1531 "xmlNewCharEncodingHandler : out of memory !\n");
1532 return(NULL);
1533 }
1534 handler->input = input;
1535 handler->output = output;
1536 handler->name = up;
1537
1538#ifdef LIBXML_ICONV_ENABLED
1539 handler->iconv_in = NULL;
1540 handler->iconv_out = NULL;
1541#endif /* LIBXML_ICONV_ENABLED */
1542
1543 /*
1544 * registers and returns the handler.
1545 */
1546 xmlRegisterCharEncodingHandler(handler);
1547#ifdef DEBUG_ENCODING
1548 xmlGenericError(xmlGenericErrorContext,
1549 "Registered encoding handler for %s\n", name);
1550#endif
1551 return(handler);
1552}
1553
1554/**
1555 * xmlInitCharEncodingHandlers:
1556 *
1557 * Initialize the char encoding support, it registers the default
1558 * encoding supported.
1559 * NOTE: while public, this function usually doesn't need to be called
1560 * in normal processing.
1561 */
1562void
1563xmlInitCharEncodingHandlers(void) {
1564 unsigned short int tst = 0x1234;
1565 unsigned char *ptr = (unsigned char *) &tst;
1566
1567 if (handlers != NULL) return;
1568
1569 handlers = (xmlCharEncodingHandlerPtr *)
1570 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1571
1572 if (*ptr == 0x12) xmlLittleEndian = 0;
1573 else if (*ptr == 0x34) xmlLittleEndian = 1;
1574 else xmlGenericError(xmlGenericErrorContext,
1575 "Odd problem at endianness detection\n");
1576
1577 if (handlers == NULL) {
1578 xmlGenericError(xmlGenericErrorContext,
1579 "xmlInitCharEncodingHandlers : out of memory !\n");
1580 return;
1581 }
1582 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1583 xmlUTF16LEHandler =
1584 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1585 xmlUTF16BEHandler =
1586 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1587 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1588 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard20042422001-05-31 18:22:04 +00001589 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor3473f882001-02-23 17:55:21 +00001590#ifdef LIBXML_HTML_ENABLED
1591 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1592#endif
1593}
1594
1595/**
1596 * xmlCleanupCharEncodingHandlers:
1597 *
1598 * Cleanup the memory allocated for the char encoding support, it
1599 * unregisters all the encoding handlers and the aliases.
1600 */
1601void
1602xmlCleanupCharEncodingHandlers(void) {
1603 xmlCleanupEncodingAliases();
1604
1605 if (handlers == NULL) return;
1606
1607 for (;nbCharEncodingHandler > 0;) {
1608 nbCharEncodingHandler--;
1609 if (handlers[nbCharEncodingHandler] != NULL) {
1610 if (handlers[nbCharEncodingHandler]->name != NULL)
1611 xmlFree(handlers[nbCharEncodingHandler]->name);
1612 xmlFree(handlers[nbCharEncodingHandler]);
1613 }
1614 }
1615 xmlFree(handlers);
1616 handlers = NULL;
1617 nbCharEncodingHandler = 0;
1618 xmlDefaultCharEncodingHandler = NULL;
1619}
1620
1621/**
1622 * xmlRegisterCharEncodingHandler:
1623 * @handler: the xmlCharEncodingHandlerPtr handler block
1624 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001625 * Register the char encoding handler, surprising, isn't it ?
Owen Taylor3473f882001-02-23 17:55:21 +00001626 */
1627void
1628xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1629 if (handlers == NULL) xmlInitCharEncodingHandlers();
1630 if (handler == NULL) {
1631 xmlGenericError(xmlGenericErrorContext,
1632 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1633 return;
1634 }
1635
1636 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1637 xmlGenericError(xmlGenericErrorContext,
1638 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1639 xmlGenericError(xmlGenericErrorContext,
1640 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1641 return;
1642 }
1643 handlers[nbCharEncodingHandler++] = handler;
1644}
1645
1646/**
1647 * xmlGetCharEncodingHandler:
1648 * @enc: an xmlCharEncoding value.
1649 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001650 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001651 *
1652 * Returns the handler or NULL if not found
1653 */
1654xmlCharEncodingHandlerPtr
1655xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1656 xmlCharEncodingHandlerPtr handler;
1657
1658 if (handlers == NULL) xmlInitCharEncodingHandlers();
1659 switch (enc) {
1660 case XML_CHAR_ENCODING_ERROR:
1661 return(NULL);
1662 case XML_CHAR_ENCODING_NONE:
1663 return(NULL);
1664 case XML_CHAR_ENCODING_UTF8:
1665 return(NULL);
1666 case XML_CHAR_ENCODING_UTF16LE:
1667 return(xmlUTF16LEHandler);
1668 case XML_CHAR_ENCODING_UTF16BE:
1669 return(xmlUTF16BEHandler);
1670 case XML_CHAR_ENCODING_EBCDIC:
1671 handler = xmlFindCharEncodingHandler("EBCDIC");
1672 if (handler != NULL) return(handler);
1673 handler = xmlFindCharEncodingHandler("ebcdic");
1674 if (handler != NULL) return(handler);
1675 break;
1676 case XML_CHAR_ENCODING_UCS4BE:
1677 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1678 if (handler != NULL) return(handler);
1679 handler = xmlFindCharEncodingHandler("UCS-4");
1680 if (handler != NULL) return(handler);
1681 handler = xmlFindCharEncodingHandler("UCS4");
1682 if (handler != NULL) return(handler);
1683 break;
1684 case XML_CHAR_ENCODING_UCS4LE:
1685 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1686 if (handler != NULL) return(handler);
1687 handler = xmlFindCharEncodingHandler("UCS-4");
1688 if (handler != NULL) return(handler);
1689 handler = xmlFindCharEncodingHandler("UCS4");
1690 if (handler != NULL) return(handler);
1691 break;
1692 case XML_CHAR_ENCODING_UCS4_2143:
1693 break;
1694 case XML_CHAR_ENCODING_UCS4_3412:
1695 break;
1696 case XML_CHAR_ENCODING_UCS2:
1697 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1698 if (handler != NULL) return(handler);
1699 handler = xmlFindCharEncodingHandler("UCS-2");
1700 if (handler != NULL) return(handler);
1701 handler = xmlFindCharEncodingHandler("UCS2");
1702 if (handler != NULL) return(handler);
1703 break;
1704
1705 /*
1706 * We used to keep ISO Latin encodings native in the
1707 * generated data. This led to so many problems that
1708 * this has been removed. One can still change this
1709 * back by registering no-ops encoders for those
1710 */
1711 case XML_CHAR_ENCODING_8859_1:
1712 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1713 if (handler != NULL) return(handler);
1714 break;
1715 case XML_CHAR_ENCODING_8859_2:
1716 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1717 if (handler != NULL) return(handler);
1718 break;
1719 case XML_CHAR_ENCODING_8859_3:
1720 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1721 if (handler != NULL) return(handler);
1722 break;
1723 case XML_CHAR_ENCODING_8859_4:
1724 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1725 if (handler != NULL) return(handler);
1726 break;
1727 case XML_CHAR_ENCODING_8859_5:
1728 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1729 if (handler != NULL) return(handler);
1730 break;
1731 case XML_CHAR_ENCODING_8859_6:
1732 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1733 if (handler != NULL) return(handler);
1734 break;
1735 case XML_CHAR_ENCODING_8859_7:
1736 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1737 if (handler != NULL) return(handler);
1738 break;
1739 case XML_CHAR_ENCODING_8859_8:
1740 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1741 if (handler != NULL) return(handler);
1742 break;
1743 case XML_CHAR_ENCODING_8859_9:
1744 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1745 if (handler != NULL) return(handler);
1746 break;
1747
1748
1749 case XML_CHAR_ENCODING_2022_JP:
1750 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1751 if (handler != NULL) return(handler);
1752 break;
1753 case XML_CHAR_ENCODING_SHIFT_JIS:
1754 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1755 if (handler != NULL) return(handler);
1756 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1757 if (handler != NULL) return(handler);
1758 handler = xmlFindCharEncodingHandler("Shift_JIS");
1759 if (handler != NULL) return(handler);
1760 break;
1761 case XML_CHAR_ENCODING_EUC_JP:
1762 handler = xmlFindCharEncodingHandler("EUC-JP");
1763 if (handler != NULL) return(handler);
1764 break;
1765 default:
1766 break;
1767 }
1768
1769#ifdef DEBUG_ENCODING
1770 xmlGenericError(xmlGenericErrorContext,
1771 "No handler found for encoding %d\n", enc);
1772#endif
1773 return(NULL);
1774}
1775
1776/**
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001777 * xmlFindCharEncodingHandler:
1778 * @name: a string describing the char encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001779 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001780 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001781 *
1782 * Returns the handler or NULL if not found
1783 */
1784xmlCharEncodingHandlerPtr
1785xmlFindCharEncodingHandler(const char *name) {
1786 const char *nalias;
1787 const char *norig;
1788 xmlCharEncoding alias;
1789#ifdef LIBXML_ICONV_ENABLED
1790 xmlCharEncodingHandlerPtr enc;
1791 iconv_t icv_in, icv_out;
1792#endif /* LIBXML_ICONV_ENABLED */
1793 char upper[100];
1794 int i;
1795
1796 if (handlers == NULL) xmlInitCharEncodingHandlers();
1797 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1798 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1799
1800 /*
1801 * Do the alias resolution
1802 */
1803 norig = name;
1804 nalias = xmlGetEncodingAlias(name);
1805 if (nalias != NULL)
1806 name = nalias;
1807
1808 /*
1809 * Check first for directly registered encoding names
1810 */
1811 for (i = 0;i < 99;i++) {
1812 upper[i] = toupper(name[i]);
1813 if (upper[i] == 0) break;
1814 }
1815 upper[i] = 0;
1816
1817 for (i = 0;i < nbCharEncodingHandler; i++)
1818 if (!strcmp(upper, handlers[i]->name)) {
1819#ifdef DEBUG_ENCODING
1820 xmlGenericError(xmlGenericErrorContext,
1821 "Found registered handler for encoding %s\n", name);
1822#endif
1823 return(handlers[i]);
1824 }
1825
1826#ifdef LIBXML_ICONV_ENABLED
1827 /* check whether iconv can handle this */
1828 icv_in = iconv_open("UTF-8", name);
1829 icv_out = iconv_open(name, "UTF-8");
1830 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1831 enc = (xmlCharEncodingHandlerPtr)
1832 xmlMalloc(sizeof(xmlCharEncodingHandler));
1833 if (enc == NULL) {
1834 iconv_close(icv_in);
1835 iconv_close(icv_out);
1836 return(NULL);
1837 }
1838 enc->name = xmlMemStrdup(name);
1839 enc->input = NULL;
1840 enc->output = NULL;
1841 enc->iconv_in = icv_in;
1842 enc->iconv_out = icv_out;
1843#ifdef DEBUG_ENCODING
1844 xmlGenericError(xmlGenericErrorContext,
1845 "Found iconv handler for encoding %s\n", name);
1846#endif
1847 return enc;
1848 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1849 xmlGenericError(xmlGenericErrorContext,
1850 "iconv : problems with filters for '%s'\n", name);
1851 }
1852#endif /* LIBXML_ICONV_ENABLED */
1853
1854#ifdef DEBUG_ENCODING
1855 xmlGenericError(xmlGenericErrorContext,
1856 "No handler found for encoding %s\n", name);
1857#endif
1858
1859 /*
1860 * Fallback using the canonical names
1861 */
1862 alias = xmlParseCharEncoding(norig);
1863 if (alias != XML_CHAR_ENCODING_ERROR) {
1864 const char* canon;
1865 canon = xmlGetCharEncodingName(alias);
1866 if ((canon != NULL) && (strcmp(name, canon))) {
1867 return(xmlFindCharEncodingHandler(canon));
1868 }
1869 }
1870
1871 return(NULL);
1872}
1873
Daniel Veillard97ac1312001-05-30 19:14:17 +00001874/************************************************************************
1875 * *
1876 * ICONV based generic conversion functions *
1877 * *
1878 ************************************************************************/
1879
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv conversion descriptor
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes in the source encoding of @cd
 * @inlen:  the length of @in
 *
 * Run one iconv() conversion step and translate its outcome into the
 * return codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (E2BIG), or
 *     -2 if the transcoding fails (EILSEQ: the input is not valid in the
 *        source encoding or can't be represented in the target one), or
 *     -3 if the last bytes can't form a single output char (EINVAL), and
 *        for any other error.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
    unsigned char *out, int *outlen,
    const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;

    /*
     * iconv() reports failure as (size_t) -1; the result is kept in a
     * size_t so the test below does not depend on a narrowing conversion.
     */
    ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
    if (in != NULL) {
        /* iconv left the remaining counts, turn them into used counts */
        *inlen -= icv_inlen;
        *outlen -= icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }
    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (errno == EILSEQ) {
            return -2;
        } else
#endif
#ifdef E2BIG
        if (errno == E2BIG) {
            return -1;
        } else
#endif
#ifdef EINVAL
        if (errno == EINVAL) {
            return -3;
        } else
#endif
        {
            return -3;
        }
    }
    return 0;
}
#endif /* LIBXML_ICONV_ENABLED */
1940
Daniel Veillard97ac1312001-05-30 19:14:17 +00001941/************************************************************************
1942 * *
1943 * The real API used by libxml for on-the-fly conversion *
1944 * *
1945 ************************************************************************/
1946
Owen Taylor3473f882001-02-23 17:55:21 +00001947/**
1948 * xmlCharEncFirstLine:
1949 * @handler: char enconding transformation data structure
1950 * @out: an xmlBuffer for the output.
1951 * @in: an xmlBuffer for the input
1952 *
1953 * Front-end for the encoding handler input function, but handle only
1954 * the very first line, i.e. limit itself to 45 chars.
1955 *
1956 * Returns the number of byte written if success, or
1957 * -1 general error
1958 * -2 if the transcoding fails (for *in is not valid utf8 string or
1959 * the result of transformation can't fit into the encoding we want), or
1960 */
1961int
1962xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1963 xmlBufferPtr in) {
1964 int ret = -2;
1965 int written;
1966 int toconv;
1967
1968 if (handler == NULL) return(-1);
1969 if (out == NULL) return(-1);
1970 if (in == NULL) return(-1);
1971
1972 written = out->size - out->use;
1973 toconv = in->use;
1974 if (toconv * 2 >= written) {
1975 xmlBufferGrow(out, toconv);
1976 written = out->size - out->use - 1;
1977 }
1978
1979 /*
1980 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1981 * 45 chars should be sufficient to reach the end of the encoding
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001982 * declaration without going too far inside the document content.
Owen Taylor3473f882001-02-23 17:55:21 +00001983 */
1984 written = 45;
1985
1986 if (handler->input != NULL) {
1987 ret = handler->input(&out->content[out->use], &written,
1988 in->content, &toconv);
1989 xmlBufferShrink(in, toconv);
1990 out->use += written;
1991 out->content[out->use] = 0;
1992 }
1993#ifdef LIBXML_ICONV_ENABLED
1994 else if (handler->iconv_in != NULL) {
1995 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1996 &written, in->content, &toconv);
1997 xmlBufferShrink(in, toconv);
1998 out->use += written;
1999 out->content[out->use] = 0;
2000 if (ret == -1) ret = -3;
2001 }
2002#endif /* LIBXML_ICONV_ENABLED */
2003#ifdef DEBUG_ENCODING
2004 switch (ret) {
2005 case 0:
2006 xmlGenericError(xmlGenericErrorContext,
2007 "converted %d bytes to %d bytes of input\n",
2008 toconv, written);
2009 break;
2010 case -1:
2011 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2012 toconv, written, in->use);
2013 break;
2014 case -2:
2015 xmlGenericError(xmlGenericErrorContext,
2016 "input conversion failed due to input error\n");
2017 break;
2018 case -3:
2019 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2020 toconv, written, in->use);
2021 break;
2022 default:
2023 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2024 }
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002025#endif /* DEBUG_ENCODING */
Owen Taylor3473f882001-02-23 17:55:21 +00002026 /*
2027 * Ignore when input buffer is not on a boundary
2028 */
2029 if (ret == -3) ret = 0;
2030 if (ret == -1) ret = 0;
2031 return(ret);
2032}
2033
2034/**
2035 * xmlCharEncInFunc:
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002036 * @handler: char encoding transformation data structure
Owen Taylor3473f882001-02-23 17:55:21 +00002037 * @out: an xmlBuffer for the output.
2038 * @in: an xmlBuffer for the input
2039 *
2040 * Generic front-end for the encoding handler input function
2041 *
2042 * Returns the number of byte written if success, or
2043 * -1 general error
2044 * -2 if the transcoding fails (for *in is not valid utf8 string or
2045 * the result of transformation can't fit into the encoding we want), or
2046 */
2047int
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002048xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2049 xmlBufferPtr in)
2050{
Owen Taylor3473f882001-02-23 17:55:21 +00002051 int ret = -2;
2052 int written;
2053 int toconv;
2054
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002055 if (handler == NULL)
2056 return (-1);
2057 if (out == NULL)
2058 return (-1);
2059 if (in == NULL)
2060 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00002061
2062 toconv = in->use;
2063 if (toconv == 0)
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002064 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00002065 written = out->size - out->use;
2066 if (toconv * 2 >= written) {
2067 xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002068 written = out->size - out->use - 1;
Owen Taylor3473f882001-02-23 17:55:21 +00002069 }
2070 if (handler->input != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002071 ret = handler->input(&out->content[out->use], &written,
2072 in->content, &toconv);
2073 xmlBufferShrink(in, toconv);
2074 out->use += written;
2075 out->content[out->use] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002076 }
2077#ifdef LIBXML_ICONV_ENABLED
2078 else if (handler->iconv_in != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002079 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2080 &written, in->content, &toconv);
2081 xmlBufferShrink(in, toconv);
2082 out->use += written;
2083 out->content[out->use] = 0;
2084 if (ret == -1)
2085 ret = -3;
Owen Taylor3473f882001-02-23 17:55:21 +00002086 }
2087#endif /* LIBXML_ICONV_ENABLED */
2088 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002089 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002090#ifdef DEBUG_ENCODING
2091 xmlGenericError(xmlGenericErrorContext,
2092 "converted %d bytes to %d bytes of input\n",
2093 toconv, written);
Owen Taylor3473f882001-02-23 17:55:21 +00002094#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002095 break;
2096 case -1:
2097#ifdef DEBUG_ENCODING
2098 xmlGenericError(xmlGenericErrorContext,
2099 "converted %d bytes to %d bytes of input, %d left\n",
2100 toconv, written, in->use);
2101#endif
2102 break;
2103 case -3:
2104#ifdef DEBUG_ENCODING
2105 xmlGenericError(xmlGenericErrorContext,
2106 "converted %d bytes to %d bytes of input, %d left\n",
2107 toconv, written, in->use);
2108#endif
2109 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002110 case -2:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002111 xmlGenericError(xmlGenericErrorContext,
2112 "input conversion failed due to input error\n");
2113 xmlGenericError(xmlGenericErrorContext,
2114 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2115 in->content[0], in->content[1],
2116 in->content[2], in->content[3]);
Owen Taylor3473f882001-02-23 17:55:21 +00002117 }
2118 /*
2119 * Ignore when input buffer is not on a boundary
2120 */
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002121 if (ret == -3)
2122 ret = 0;
2123 return (ret);
Owen Taylor3473f882001-02-23 17:55:21 +00002124}
2125
2126/**
2127 * xmlCharEncOutFunc:
2128 * @handler: char enconding transformation data structure
2129 * @out: an xmlBuffer for the output.
2130 * @in: an xmlBuffer for the input
2131 *
2132 * Generic front-end for the encoding handler output function
2133 * a first call with @in == NULL has to be made firs to initiate the
2134 * output in case of non-stateless encoding needing to initiate their
2135 * state or the output (like the BOM in UTF16).
2136 * In case of UTF8 sequence conversion errors for the given encoder,
2137 * the content will be automatically remapped to a CharRef sequence.
2138 *
2139 * Returns the number of byte written if success, or
2140 * -1 general error
2141 * -2 if the transcoding fails (for *in is not valid utf8 string or
2142 * the result of transformation can't fit into the encoding we want), or
2143 */
2144int
2145xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2146 xmlBufferPtr in) {
2147 int ret = -2;
2148 int written;
2149 int writtentot = 0;
2150 int toconv;
2151 int output = 0;
2152
2153 if (handler == NULL) return(-1);
2154 if (out == NULL) return(-1);
2155
2156retry:
2157
2158 written = out->size - out->use;
2159
2160 /*
2161 * First specific handling of in = NULL, i.e. the initialization call
2162 */
2163 if (in == NULL) {
2164 toconv = 0;
2165 if (handler->output != NULL) {
2166 ret = handler->output(&out->content[out->use], &written,
2167 NULL, &toconv);
2168 out->use += written;
2169 out->content[out->use] = 0;
2170 }
2171#ifdef LIBXML_ICONV_ENABLED
2172 else if (handler->iconv_out != NULL) {
2173 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2174 &written, NULL, &toconv);
2175 out->use += written;
2176 out->content[out->use] = 0;
2177 }
2178#endif /* LIBXML_ICONV_ENABLED */
2179#ifdef DEBUG_ENCODING
2180 xmlGenericError(xmlGenericErrorContext,
2181 "initialized encoder\n");
2182#endif
2183 return(0);
2184 }
2185
2186 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002187 * Conversion itself.
Owen Taylor3473f882001-02-23 17:55:21 +00002188 */
2189 toconv = in->use;
2190 if (toconv == 0)
2191 return(0);
2192 if (toconv * 2 >= written) {
2193 xmlBufferGrow(out, toconv * 2);
2194 written = out->size - out->use - 1;
2195 }
2196 if (handler->output != NULL) {
2197 ret = handler->output(&out->content[out->use], &written,
2198 in->content, &toconv);
2199 xmlBufferShrink(in, toconv);
2200 out->use += written;
2201 writtentot += written;
2202 out->content[out->use] = 0;
2203 }
2204#ifdef LIBXML_ICONV_ENABLED
2205 else if (handler->iconv_out != NULL) {
2206 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2207 &written, in->content, &toconv);
2208 xmlBufferShrink(in, toconv);
2209 out->use += written;
2210 writtentot += written;
2211 out->content[out->use] = 0;
2212 if (ret == -1) {
2213 if (written > 0) {
2214 /*
2215 * Can be a limitation of iconv
2216 */
2217 goto retry;
2218 }
2219 ret = -3;
2220 }
2221 }
2222#endif /* LIBXML_ICONV_ENABLED */
2223 else {
2224 xmlGenericError(xmlGenericErrorContext,
2225 "xmlCharEncOutFunc: no output function !\n");
2226 return(-1);
2227 }
2228
2229 if (ret >= 0) output += ret;
2230
2231 /*
2232 * Attempt to handle error cases
2233 */
2234 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002235 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002236#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002237 xmlGenericError(xmlGenericErrorContext,
2238 "converted %d bytes to %d bytes of output\n",
2239 toconv, written);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002240#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002241 break;
2242 case -1:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002243#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002244 xmlGenericError(xmlGenericErrorContext,
2245 "output conversion failed by lack of space\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002246#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002247 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002248 case -3:
2249 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2250 toconv, written, in->use);
2251 break;
2252 case -2: {
2253 int len = in->use;
2254 const xmlChar *utf = (const xmlChar *) in->content;
2255 int cur;
2256
2257 cur = xmlGetUTF8Char(utf, &len);
2258 if (cur > 0) {
2259 xmlChar charref[20];
2260
2261#ifdef DEBUG_ENCODING
2262 xmlGenericError(xmlGenericErrorContext,
2263 "handling output conversion error\n");
2264 xmlGenericError(xmlGenericErrorContext,
2265 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2266 in->content[0], in->content[1],
2267 in->content[2], in->content[3]);
2268#endif
2269 /*
2270 * Removes the UTF8 sequence, and replace it by a charref
2271 * and continue the transcoding phase, hoping the error
2272 * did not mangle the encoder state.
2273 */
Daniel Veillard16698282001-09-14 10:29:27 +00002274 sprintf((char *) charref, "&#%d;", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002275 xmlBufferShrink(in, len);
2276 xmlBufferAddHead(in, charref, -1);
2277
2278 goto retry;
2279 } else {
2280 xmlGenericError(xmlGenericErrorContext,
2281 "output conversion failed due to conv error\n");
2282 xmlGenericError(xmlGenericErrorContext,
2283 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2284 in->content[0], in->content[1],
2285 in->content[2], in->content[3]);
2286 in->content[0] = ' ';
2287 }
2288 break;
2289 }
2290 }
2291 return(ret);
2292}
2293
2294/**
2295 * xmlCharEncCloseFunc:
2296 * @handler: char enconding transformation data structure
2297 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002298 * Generic front-end for encoding handler close function
Owen Taylor3473f882001-02-23 17:55:21 +00002299 *
2300 * Returns 0 if success, or -1 in case of error
2301 */
2302int
2303xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2304 int ret = 0;
2305 if (handler == NULL) return(-1);
2306 if (handler->name == NULL) return(-1);
2307#ifdef LIBXML_ICONV_ENABLED
2308 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002309 * Iconv handlers can be used only once, free the whole block.
Owen Taylor3473f882001-02-23 17:55:21 +00002310 * and the associated icon resources.
2311 */
2312 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2313 if (handler->name != NULL)
2314 xmlFree(handler->name);
2315 handler->name = NULL;
2316 if (handler->iconv_out != NULL) {
2317 if (iconv_close(handler->iconv_out))
2318 ret = -1;
2319 handler->iconv_out = NULL;
2320 }
2321 if (handler->iconv_in != NULL) {
2322 if (iconv_close(handler->iconv_in))
2323 ret = -1;
2324 handler->iconv_in = NULL;
2325 }
2326 xmlFree(handler);
2327 }
2328#endif /* LIBXML_ICONV_ENABLED */
2329#ifdef DEBUG_ENCODING
2330 if (ret)
2331 xmlGenericError(xmlGenericErrorContext,
2332 "failed to close the encoding handler\n");
2333 else
2334 xmlGenericError(xmlGenericErrorContext,
2335 "closed the encoding handler\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002336#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002337
Owen Taylor3473f882001-02-23 17:55:21 +00002338 return(ret);
2339}
2340