blob: 69d67cd6b9fcab1ca5a0f1b9f948055000d48599 [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 *
 * UTF8 string routines from:
 * "William M. Brack" <wbrack@mmm.com.hk>
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 */
25
Daniel Veillard34ce8be2002-03-18 19:37:11 +000026#define IN_LIBXML
Bjorn Reese70a9da52001-04-21 16:57:29 +000027#include "libxml.h"
Owen Taylor3473f882001-02-23 17:55:21 +000028
Owen Taylor3473f882001-02-23 17:55:21 +000029#include <string.h>
30
31#ifdef HAVE_CTYPE_H
32#include <ctype.h>
33#endif
34#ifdef HAVE_STDLIB_H
35#include <stdlib.h>
36#endif
Owen Taylor3473f882001-02-23 17:55:21 +000037#ifdef LIBXML_ICONV_ENABLED
38#ifdef HAVE_ERRNO_H
39#include <errno.h>
40#endif
41#endif
42#include <libxml/encoding.h>
43#include <libxml/xmlmemory.h>
44#ifdef LIBXML_HTML_ENABLED
45#include <libxml/HTMLparser.h>
46#endif
Daniel Veillard64a411c2001-10-15 12:32:07 +000047#include <libxml/globals.h>
Daniel Veillarda4617b82001-11-04 20:19:12 +000048#include <libxml/xmlerror.h>
Owen Taylor3473f882001-02-23 17:55:21 +000049
Daniel Veillard22090732001-07-16 00:06:07 +000050static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
51static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +000052
53typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
54typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
55struct _xmlCharEncodingAlias {
56 const char *name;
57 const char *alias;
58};
59
60static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
61static int xmlCharEncodingAliasesNb = 0;
62static int xmlCharEncodingAliasesMax = 0;
63
64#ifdef LIBXML_ICONV_ENABLED
65#if 0
66#define DEBUG_ENCODING /* Define this to get encoding traces */
67#endif
68#endif
69
70static int xmlLittleEndian = 1;
71
Daniel Veillard97ac1312001-05-30 19:14:17 +000072/************************************************************************
73 * *
74 * Generic UTF8 handling routines *
75 * *
76 * From rfc2044: encoding of the Unicode values on UTF-8: *
77 * *
78 * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
79 * 0000 0000-0000 007F 0xxxxxxx *
80 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
81 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
82 * *
83 * I hope we won't use values > 0xFFFF anytime soon ! *
84 * *
85 ************************************************************************/
Owen Taylor3473f882001-02-23 17:55:21 +000086
87/**
Daniel Veillarde043ee12001-04-16 14:08:07 +000088 * xmlUTF8Strlen:
89 * @utf: a sequence of UTF-8 encoded bytes
90 *
Daniel Veillard60087f32001-10-10 09:45:09 +000091 * compute the length of an UTF8 string, it doesn't do a full UTF8
Daniel Veillarde043ee12001-04-16 14:08:07 +000092 * checking of the content of the string.
93 *
94 * Returns the number of characters in the string or -1 in case of error
95 */
96int
Daniel Veillard97ac1312001-05-30 19:14:17 +000097xmlUTF8Strlen(const xmlChar *utf) {
Daniel Veillarde043ee12001-04-16 14:08:07 +000098 int ret = 0;
99
100 if (utf == NULL)
101 return(-1);
102
103 while (*utf != 0) {
104 if (utf[0] & 0x80) {
105 if ((utf[1] & 0xc0) != 0x80)
106 return(-1);
107 if ((utf[0] & 0xe0) == 0xe0) {
108 if ((utf[2] & 0xc0) != 0x80)
109 return(-1);
110 if ((utf[0] & 0xf0) == 0xf0) {
111 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
112 return(-1);
113 utf += 4;
114 } else {
115 utf += 3;
116 }
117 } else {
118 utf += 2;
119 }
120 } else {
121 utf++;
122 }
123 ret++;
124 }
125 return(ret);
126}
127
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  on input the number of bytes available at @utf, on output the
 *        number of bytes used by the decoded character
 *
 * Read one UTF-8 encoded character from @utf, never looking at more than
 * the *@len bytes announced by the caller.
 *
 * NOTE: the check is not strict; a lead byte in the range 0x80..0xBF is
 * decoded as the lead of a 2-byte sequence and overlong forms are
 * accepted.
 *
 * Returns the character value, or -1 in case of error.  On error *@len is
 * set to 0, provided @len itself is not NULL.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when len is NULL: do not write through it */
    if (len != NULL)
        *len = 0;
    return(-1);
}
194
/**
 * xmlCheckUTF8:
 * @utf:  pointer to a putative UTF-8 encoded string
 *
 * Checks @utf for being valid UTF-8.  @utf is assumed to be
 * null-terminated.  This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary.  Note that Java is
 * capable of producing these sequences if provoked.  Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * the 0x10ffff maximum value.
 *
 * Return value: 1 if @utf is valid, 0 if it is not or if @utf is NULL.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    for (ix = 0; (c = utf[ix]) != 0;) {
        if (c & 0x80) {
            /*
             * A missing continuation byte is caught by the tests below:
             * the terminating 0 never matches the 10xxxxxx pattern, so
             * the scan cannot run past the end of the string.
             */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            if ((c & 0xe0) == 0xe0) {
                if ((utf[ix + 2] & 0xc0) != 0x80)
                    return(0);
                if ((c & 0xf0) == 0xf0) {
                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                        return(0);
                    /* 4-byte code */
                    ix += 4;
                } else {
                    /* 3-byte code */
                    ix += 3;
                }
            } else {
                /* 2-byte code */
                ix += 2;
            }
        } else {
            /* 1-byte code */
            ix++;
        }
    }
    return(1);
}
238
239/**
Daniel Veillard97ac1312001-05-30 19:14:17 +0000240 * xmlUTF8Strsize:
241 * @utf: a sequence of UTF-8 encoded bytes
242 * @len: the number of characters in the array
243 *
244 * storage size of an UTF8 string
245 *
246 * Returns the storage size of
247 * the first 'len' characters of ARRAY
248 *
249 */
250
251int
252xmlUTF8Strsize(const xmlChar *utf, int len) {
253 const xmlChar *ptr=utf;
254 xmlChar ch;
255
256 if (len <= 0)
257 return(0);
258
259 while ( len-- > 0) {
260 if ( !*ptr )
261 break;
262 if ( (ch = *ptr++) & 0x80)
263 while ( (ch<<=1) & 0x80 )
264 ptr++;
265 }
266 return (ptr - utf);
267}
268
269
270/**
271 * xmlUTF8Strndup:
272 * @utf: the input UTF8 *
273 * @len: the len of @utf (in chars)
274 *
275 * a strndup for array of UTF8's
276 *
277 * Returns a new UTF8 * or NULL
278 */
279xmlChar *
280xmlUTF8Strndup(const xmlChar *utf, int len) {
281 xmlChar *ret;
282 int i;
283
284 if ((utf == NULL) || (len < 0)) return(NULL);
285 i = xmlUTF8Strsize(utf, len);
286 ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
287 if (ret == NULL) {
288 xmlGenericError(xmlGenericErrorContext,
289 "malloc of %ld byte failed\n",
290 (len + 1) * (long)sizeof(xmlChar));
291 return(NULL);
292 }
293 memcpy(ret, utf, i * sizeof(xmlChar));
294 ret[i] = 0;
295 return(ret);
296}
297
298/**
299 * xmlUTF8Strpos:
300 * @utf: the input UTF8 *
301 * @pos: the position of the desired UTF8 char (in chars)
302 *
303 * a function to provide the equivalent of fetching a
304 * character from a string array
305 *
306 * Returns a pointer to the UTF8 character or NULL
307 */
308xmlChar *
309xmlUTF8Strpos(const xmlChar *utf, int pos) {
310 xmlChar ch;
311
312 if (utf == NULL) return(NULL);
313 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
314 return(NULL);
315 while (pos--) {
316 if ((ch=*utf++) == 0) return(NULL);
317 if ( ch & 0x80 ) {
318 /* if not simple ascii, verify proper format */
319 if ( (ch & 0xc0) != 0xc0 )
320 return(NULL);
321 /* then skip over remaining bytes for this char */
322 while ( (ch <<= 1) & 0x80 )
323 if ( (*utf++ & 0xc0) != 0x80 )
324 return(NULL);
325 }
326 }
327 return((xmlChar *)utf);
328}
329
330/**
331 * xmlUTF8Strloc:
332 * @utf: the input UTF8 *
333 * @utfchar: the UTF8 character to be found
334 *
335 * a function to provide relative location of a UTF8 char
336 *
337 * Returns the relative character position of the desired char
338 * or -1 if not found
339 */
340int
341xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
342 int i, size;
343 xmlChar ch;
344
345 if (utf==NULL || utfchar==NULL) return -1;
346 size = xmlUTF8Strsize(utfchar, 1);
347 for(i=0; (ch=*utf) != 0; i++) {
348 if (xmlStrncmp(utf, utfchar, size)==0)
349 return(i);
350 utf++;
351 if ( ch & 0x80 ) {
352 /* if not simple ascii, verify proper format */
353 if ( (ch & 0xc0) != 0xc0 )
354 return(-1);
355 /* then skip over remaining bytes for this char */
356 while ( (ch <<= 1) & 0x80 )
357 if ( (*utf++ & 0xc0) != 0x80 )
358 return(-1);
359 }
360 }
361
362 return(-1);
363}
364/**
365 * xmlUTF8Strsub:
366 * @utf: a sequence of UTF-8 encoded bytes
Daniel Veillard97ac1312001-05-30 19:14:17 +0000367 * @start: relative pos of first char
368 * @len: total number to copy
369 *
370 * Note: positions are given in units of UTF-8 chars
371 *
372 * Returns a pointer to a newly created string
373 * or NULL if any problem
374 */
375
376xmlChar *
377xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
378 int i;
379 xmlChar ch;
380
381 if (utf == NULL) return(NULL);
382 if (start < 0) return(NULL);
383 if (len < 0) return(NULL);
384
385 /*
386 * Skip over any leading chars
387 */
388 for (i = 0;i < start;i++) {
389 if ((ch=*utf++) == 0) return(NULL);
390 if ( ch & 0x80 ) {
391 /* if not simple ascii, verify proper format */
392 if ( (ch & 0xc0) != 0xc0 )
393 return(NULL);
394 /* then skip over remaining bytes for this char */
395 while ( (ch <<= 1) & 0x80 )
396 if ( (*utf++ & 0xc0) != 0x80 )
397 return(NULL);
398 }
399 }
400
401 return(xmlUTF8Strndup(utf, len));
402}
403
404/************************************************************************
405 * *
406 * Conversions To/From UTF8 encoding *
407 * *
408 ************************************************************************/
409
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out.  ASCII bytes are copied unchanged; the conversion
 * stops at the first byte >= 0x80.
 *
 * Returns 0 if success, or -1 if a non-ASCII byte is met.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    const unsigned char* processed = in;
    const unsigned char* inend = in + (*inlen);
    unsigned int c;

    /*
     * More than 5 bytes of output must be free before a byte is
     * converted, the same margin UTF16LEToUTF8 uses.  It also guarantees
     * that the store below cannot overflow @out.
     */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = *in++;

        if (c >= 0x80) {
            /* not ASCII: report what was converted so far and fail */
            *outlen = out - outstart;
            *inlen = processed - base;
            return(-1);
        }
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - base;
    return(0);
}
460
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars, or NULL for initialization
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a character outside of ASCII), or -1 if @out, @outlen or @inlen is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * A sequence cut short by the end of @in is left unconsumed.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto fail;  /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto fail;  /* no chance for this in Ascii */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /*
             * A byte which is not a continuation byte makes the sequence
             * invalid; fail as UTF8Toisolat1 does instead of emitting a
             * half decoded value and swallowing the offending byte.
             */
            if ((d & 0xC0) != 0x80)
                goto fail;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80)
            goto fail;                  /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

fail:
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
543
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.  Bytes below 0x80 are copied, the others are
 * expanded to a 2-byte sequence.  The conversion stops when either
 * buffer is exhausted.
 *
 * Returns 0.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend;
    const unsigned char* instop;

    inend = in + (*inlen);
    instop = inend;

    /*
     * Every read of *in is preceded by a bound check on in, so nothing is
     * fetched at or after inend (nor at all when *inlen is 0).
     * The outer loop requires room for a 2-byte sequence.
     */
    while ((in < inend) && (outend - out > 1)) {
        if (*in >= 0x80) {
            *out++ = ((*in >> 6) & 0x1F) | 0xC0;
            *out++ = (*in & 0x3F) | 0x80;
            ++in;
        }
        /* cap the ASCII run below to what still fits in the output */
        if ((instop - in) > (outend - out))
            instop = in + (outend - out);
        while ((in < instop) && (*in < 0x80))
            *out++ = *in++;
    }
    /* a single byte of room left: it can still take one ASCII char */
    if ((in < inend) && (out < outend) && (*in < 0x80))
        *out++ = *in++;
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
593
Daniel Veillarde72c7562002-05-31 09:47:30 +0000594
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars, or NULL for initialization
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a character above 0xFF), or -1 if @out, @outlen or @inlen is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * A sequence cut short by the end of @in is left unconsumed.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto fail;  /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto fail;  /* no chance for this in IsoLat1 */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing)
            break;

        /* the check above guarantees that the trailing bytes are there */
        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto fail;              /* not a continuation byte */
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c > 0xFF)
            goto fail;                  /* no chance for this in IsoLat1 */
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

fail:
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
682
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is read one byte at a time, so it
 * needs no particular alignment and the result does not depend on the
 * byte order of the host.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate).
 * The value of *@inlenb after return is the number of octets consumed;
 * an odd trailing byte or a high surrogate ending the block is left
 * unconsumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    /* only whole 16-bit units are converted */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    /*
     * More than 5 bytes of output must be free before a character is
     * converted, so the longest (4-byte) UTF-8 form always fits.
     */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)             /* pair cut by the end of input */
                break;
            d = in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
770
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars, or NULL for initialization
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.  The output is written one byte at a time, so it
 * needs no particular alignment and the result does not depend on the
 * byte order of the host.
 *
 * When @in is NULL the Byte Order Mark (FF FE) is written if it fits.
 *
 * Returns 0 if success, -2 if the transcoding failed (malformed UTF-8);
 * for the initialization call, 2 if the BOM was written and 0 otherwise.
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 * A sequence cut short by the end of @in, a character which no longer
 * fits in @outb, or a value above 0x10FFFF stops the conversion and is
 * left unconsumed.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units are written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto fail;  /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto fail;  /* no chance for this in UTF-16 */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto fail;              /* not a continuation byte */
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
        }
        else
            break;
        processed = in;
    }
    *outlen = out - outb;
    /* consumed bytes are counted from the start of the input */
    *inlen = processed - instart;
    return(0);

fail:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
889
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out.  The input is read one byte at a time, so it
 * needs no particular alignment and the result does not depend on the
 * byte order of the host.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate).
 * The value of *@inlenb after return is the number of octets consumed;
 * an odd trailing byte, a high surrogate ending the block, or a
 * character whose UTF-8 form no longer fits in @out is left unconsumed.
 * The value of *@outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;
    int need;

    /* only whole 16-bit units are converted */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            /*
             * The pair is cut by the end of the input: stop here, as
             * UTF16LEToUTF8 does, so the caller can supply the rest.
             */
            if (in >= inend)
                break;
            d = ((unsigned int) in[0] << 8) | in[1];
            in += 2;
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) need = 1;
        else if (c <   0x800) need = 2;
        else if (c < 0x10000) need = 3;
        else                  need = 4;
        /* never emit a truncated sequence: all of it must fit */
        if (outend - out < need)
            break;

        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;
        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
981
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars, or NULL for initialization
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out.  The output is written one byte at a time, so it
 * needs no particular alignment and the result does not depend on the
 * byte order of the host.
 *
 * When @in is NULL the Byte Order Mark (FE FF) is written if it fits.
 *
 * Returns 0 if success, -2 if the transcoding failed (malformed UTF-8);
 * for the initialization call, 2 if the BOM was written and 0 otherwise.
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 * A sequence cut short by the end of @in, a character which no longer
 * fits in @outb, or a value above 0x10FFFF stops the conversion and is
 * left unconsumed.
 */
static int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + *inlen;
    /* only whole 16-bit units are written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto fail;  /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto fail;  /* no chance for this in UTF-16 */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto fail;              /* not a continuation byte */
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) (c & 0xFF);
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
        }
        else
            break;
        processed = in;
    }
    /* both counts are in bytes, on the error path as well */
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);

fail:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
1097
Daniel Veillard97ac1312001-05-30 19:14:17 +00001098/************************************************************************
1099 * *
1100 * Generic encoding handling routines *
1101 * *
1102 ************************************************************************/
1103
Owen Taylor3473f882001-02-23 17:55:21 +00001104/**
1105 * xmlDetectCharEncoding:
1106 * @in: a pointer to the first bytes of the XML entity, must be at least
1107 * 4 bytes long.
1108 * @len: pointer to the length of the buffer
1109 *
1110 * Guess the encoding of the entity using the first bytes of the entity content
1111 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1112 *
1113 * Returns one of the XML_CHAR_ENCODING_... values.
1114 */
1115xmlCharEncoding
1116xmlDetectCharEncoding(const unsigned char* in, int len)
1117{
1118 if (len >= 4) {
1119 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1120 (in[2] == 0x00) && (in[3] == 0x3C))
1121 return(XML_CHAR_ENCODING_UCS4BE);
1122 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1123 (in[2] == 0x00) && (in[3] == 0x00))
1124 return(XML_CHAR_ENCODING_UCS4LE);
1125 if ((in[0] == 0x00) && (in[1] == 0x00) &&
1126 (in[2] == 0x3C) && (in[3] == 0x00))
1127 return(XML_CHAR_ENCODING_UCS4_2143);
1128 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1129 (in[2] == 0x00) && (in[3] == 0x00))
1130 return(XML_CHAR_ENCODING_UCS4_3412);
1131 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1132 (in[2] == 0xA7) && (in[3] == 0x94))
1133 return(XML_CHAR_ENCODING_EBCDIC);
1134 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1135 (in[2] == 0x78) && (in[3] == 0x6D))
1136 return(XML_CHAR_ENCODING_UTF8);
1137 }
Daniel Veillard87a764e2001-06-20 17:41:10 +00001138 if (len >= 3) {
1139 /*
1140 * Errata on XML-1.0 June 20 2001
1141 * We now allow an UTF8 encoded BOM
1142 */
1143 if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1144 (in[2] == 0xBF))
1145 return(XML_CHAR_ENCODING_UTF8);
1146 }
Owen Taylor3473f882001-02-23 17:55:21 +00001147 if (len >= 2) {
1148 if ((in[0] == 0xFE) && (in[1] == 0xFF))
1149 return(XML_CHAR_ENCODING_UTF16BE);
1150 if ((in[0] == 0xFF) && (in[1] == 0xFE))
1151 return(XML_CHAR_ENCODING_UTF16LE);
1152 }
1153 return(XML_CHAR_ENCODING_NONE);
1154}
1155
1156/**
1157 * xmlCleanupEncodingAliases:
1158 *
1159 * Unregisters all aliases
1160 */
1161void
1162xmlCleanupEncodingAliases(void) {
1163 int i;
1164
1165 if (xmlCharEncodingAliases == NULL)
1166 return;
1167
1168 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1169 if (xmlCharEncodingAliases[i].name != NULL)
1170 xmlFree((char *) xmlCharEncodingAliases[i].name);
1171 if (xmlCharEncodingAliases[i].alias != NULL)
1172 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1173 }
1174 xmlCharEncodingAliasesNb = 0;
1175 xmlCharEncodingAliasesMax = 0;
1176 xmlFree(xmlCharEncodingAliases);
Daniel Veillard73c6e532002-01-08 13:15:33 +00001177 xmlCharEncodingAliases = NULL;
Owen Taylor3473f882001-02-23 17:55:21 +00001178}
1179
1180/**
1181 * xmlGetEncodingAlias:
1182 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1183 *
1184 * Lookup an encoding name for the given alias.
1185 *
1186 * Returns NULL if not found the original name otherwise
1187 */
1188const char *
1189xmlGetEncodingAlias(const char *alias) {
1190 int i;
1191 char upper[100];
1192
1193 if (alias == NULL)
1194 return(NULL);
1195
1196 if (xmlCharEncodingAliases == NULL)
1197 return(NULL);
1198
1199 for (i = 0;i < 99;i++) {
1200 upper[i] = toupper(alias[i]);
1201 if (upper[i] == 0) break;
1202 }
1203 upper[i] = 0;
1204
1205 /*
1206 * Walk down the list looking for a definition of the alias
1207 */
1208 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1209 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1210 return(xmlCharEncodingAliases[i].name);
1211 }
1212 }
1213 return(NULL);
1214}
1215
1216/**
1217 * xmlAddEncodingAlias:
1218 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1219 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1220 *
1221 * Registers and alias @alias for an encoding named @name. Existing alias
1222 * will be overwritten.
1223 *
1224 * Returns 0 in case of success, -1 in case of error
1225 */
1226int
1227xmlAddEncodingAlias(const char *name, const char *alias) {
1228 int i;
1229 char upper[100];
1230
1231 if ((name == NULL) || (alias == NULL))
1232 return(-1);
1233
1234 for (i = 0;i < 99;i++) {
1235 upper[i] = toupper(alias[i]);
1236 if (upper[i] == 0) break;
1237 }
1238 upper[i] = 0;
1239
1240 if (xmlCharEncodingAliases == NULL) {
1241 xmlCharEncodingAliasesNb = 0;
1242 xmlCharEncodingAliasesMax = 20;
1243 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1244 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1245 if (xmlCharEncodingAliases == NULL)
1246 return(-1);
1247 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1248 xmlCharEncodingAliasesMax *= 2;
1249 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1250 xmlRealloc(xmlCharEncodingAliases,
1251 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1252 }
1253 /*
1254 * Walk down the list looking for a definition of the alias
1255 */
1256 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1257 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1258 /*
1259 * Replace the definition.
1260 */
1261 xmlFree((char *) xmlCharEncodingAliases[i].name);
1262 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1263 return(0);
1264 }
1265 }
1266 /*
1267 * Add the definition
1268 */
1269 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1270 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1271 xmlCharEncodingAliasesNb++;
1272 return(0);
1273}
1274
1275/**
1276 * xmlDelEncodingAlias:
1277 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1278 *
1279 * Unregisters an encoding alias @alias
1280 *
1281 * Returns 0 in case of success, -1 in case of error
1282 */
1283int
1284xmlDelEncodingAlias(const char *alias) {
1285 int i;
1286
1287 if (alias == NULL)
1288 return(-1);
1289
1290 if (xmlCharEncodingAliases == NULL)
1291 return(-1);
1292 /*
1293 * Walk down the list looking for a definition of the alias
1294 */
1295 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1296 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1297 xmlFree((char *) xmlCharEncodingAliases[i].name);
1298 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1299 xmlCharEncodingAliasesNb--;
1300 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1301 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1302 return(0);
1303 }
1304 }
1305 return(-1);
1306}
1307
1308/**
1309 * xmlParseCharEncoding:
1310 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1311 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001312 * Compare the string to the known encoding schemes already known. Note
Owen Taylor3473f882001-02-23 17:55:21 +00001313 * that the comparison is case insensitive accordingly to the section
1314 * [XML] 4.3.3 Character Encoding in Entities.
1315 *
1316 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1317 * if not recognized.
1318 */
1319xmlCharEncoding
1320xmlParseCharEncoding(const char* name)
1321{
1322 const char *alias;
1323 char upper[500];
1324 int i;
1325
1326 if (name == NULL)
1327 return(XML_CHAR_ENCODING_NONE);
1328
1329 /*
1330 * Do the alias resolution
1331 */
1332 alias = xmlGetEncodingAlias(name);
1333 if (alias != NULL)
1334 name = alias;
1335
1336 for (i = 0;i < 499;i++) {
1337 upper[i] = toupper(name[i]);
1338 if (upper[i] == 0) break;
1339 }
1340 upper[i] = 0;
1341
1342 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1343 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1344 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1345
1346 /*
1347 * NOTE: if we were able to parse this, the endianness of UTF16 is
1348 * already found and in use
1349 */
1350 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1351 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1352
1353 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1354 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1355 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1356
1357 /*
1358 * NOTE: if we were able to parse this, the endianness of UCS4 is
1359 * already found and in use
1360 */
1361 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1362 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1363 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1364
1365
1366 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1367 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1368 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1369
1370 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1371 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1372 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1373
1374 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1375 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1376 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1377 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1378 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1379 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1380 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1381
1382 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1383 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1384 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1385
1386#ifdef DEBUG_ENCODING
1387 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1388#endif
1389 return(XML_CHAR_ENCODING_ERROR);
1390}
1391
1392/**
1393 * xmlGetCharEncodingName:
1394 * @enc: the encoding
1395 *
1396 * The "canonical" name for XML encoding.
1397 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1398 * Section 4.3.3 Character Encoding in Entities
1399 *
1400 * Returns the canonical name for the given encoding
1401 */
1402
1403const char*
1404xmlGetCharEncodingName(xmlCharEncoding enc) {
1405 switch (enc) {
1406 case XML_CHAR_ENCODING_ERROR:
1407 return(NULL);
1408 case XML_CHAR_ENCODING_NONE:
1409 return(NULL);
1410 case XML_CHAR_ENCODING_UTF8:
1411 return("UTF-8");
1412 case XML_CHAR_ENCODING_UTF16LE:
1413 return("UTF-16");
1414 case XML_CHAR_ENCODING_UTF16BE:
1415 return("UTF-16");
1416 case XML_CHAR_ENCODING_EBCDIC:
1417 return("EBCDIC");
1418 case XML_CHAR_ENCODING_UCS4LE:
1419 return("ISO-10646-UCS-4");
1420 case XML_CHAR_ENCODING_UCS4BE:
1421 return("ISO-10646-UCS-4");
1422 case XML_CHAR_ENCODING_UCS4_2143:
1423 return("ISO-10646-UCS-4");
1424 case XML_CHAR_ENCODING_UCS4_3412:
1425 return("ISO-10646-UCS-4");
1426 case XML_CHAR_ENCODING_UCS2:
1427 return("ISO-10646-UCS-2");
1428 case XML_CHAR_ENCODING_8859_1:
1429 return("ISO-8859-1");
1430 case XML_CHAR_ENCODING_8859_2:
1431 return("ISO-8859-2");
1432 case XML_CHAR_ENCODING_8859_3:
1433 return("ISO-8859-3");
1434 case XML_CHAR_ENCODING_8859_4:
1435 return("ISO-8859-4");
1436 case XML_CHAR_ENCODING_8859_5:
1437 return("ISO-8859-5");
1438 case XML_CHAR_ENCODING_8859_6:
1439 return("ISO-8859-6");
1440 case XML_CHAR_ENCODING_8859_7:
1441 return("ISO-8859-7");
1442 case XML_CHAR_ENCODING_8859_8:
1443 return("ISO-8859-8");
1444 case XML_CHAR_ENCODING_8859_9:
1445 return("ISO-8859-9");
1446 case XML_CHAR_ENCODING_2022_JP:
1447 return("ISO-2022-JP");
1448 case XML_CHAR_ENCODING_SHIFT_JIS:
1449 return("Shift-JIS");
1450 case XML_CHAR_ENCODING_EUC_JP:
1451 return("EUC-JP");
1452 case XML_CHAR_ENCODING_ASCII:
1453 return(NULL);
1454 }
1455 return(NULL);
1456}
1457
/************************************************************************
 *                                                                      *
 *              Char encoding handlers                                  *
 *                                                                      *
 ************************************************************************/
1463
Owen Taylor3473f882001-02-23 17:55:21 +00001464
1465/* the size should be growable, but it's not a big deal ... */
1466#define MAX_ENCODING_HANDLERS 50
1467static xmlCharEncodingHandlerPtr *handlers = NULL;
1468static int nbCharEncodingHandler = 0;
1469
1470/*
1471 * The default is UTF-8 for XML, that's also the default used for the
1472 * parser internals, so the default encoding handler is NULL
1473 */
1474
1475static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1476
1477/**
1478 * xmlNewCharEncodingHandler:
1479 * @name: the encoding name, in UTF-8 format (ASCII actually)
1480 * @input: the xmlCharEncodingInputFunc to read that encoding
1481 * @output: the xmlCharEncodingOutputFunc to write that encoding
1482 *
1483 * Create and registers an xmlCharEncodingHandler.
Daniel Veillard6f46f6c2002-08-01 12:22:24 +00001484 *
Owen Taylor3473f882001-02-23 17:55:21 +00001485 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1486 */
Daniel Veillard6f46f6c2002-08-01 12:22:24 +00001487xmlCharEncodingHandlerPtr
Owen Taylor3473f882001-02-23 17:55:21 +00001488xmlNewCharEncodingHandler(const char *name,
1489 xmlCharEncodingInputFunc input,
1490 xmlCharEncodingOutputFunc output) {
1491 xmlCharEncodingHandlerPtr handler;
1492 const char *alias;
1493 char upper[500];
1494 int i;
1495 char *up = 0;
1496
1497 /*
1498 * Do the alias resolution
1499 */
1500 alias = xmlGetEncodingAlias(name);
1501 if (alias != NULL)
1502 name = alias;
1503
1504 /*
1505 * Keep only the uppercase version of the encoding.
1506 */
1507 if (name == NULL) {
1508 xmlGenericError(xmlGenericErrorContext,
1509 "xmlNewCharEncodingHandler : no name !\n");
1510 return(NULL);
1511 }
1512 for (i = 0;i < 499;i++) {
1513 upper[i] = toupper(name[i]);
1514 if (upper[i] == 0) break;
1515 }
1516 upper[i] = 0;
1517 up = xmlMemStrdup(upper);
1518 if (up == NULL) {
1519 xmlGenericError(xmlGenericErrorContext,
1520 "xmlNewCharEncodingHandler : out of memory !\n");
1521 return(NULL);
1522 }
1523
1524 /*
1525 * allocate and fill-up an handler block.
1526 */
1527 handler = (xmlCharEncodingHandlerPtr)
1528 xmlMalloc(sizeof(xmlCharEncodingHandler));
1529 if (handler == NULL) {
1530 xmlGenericError(xmlGenericErrorContext,
1531 "xmlNewCharEncodingHandler : out of memory !\n");
1532 return(NULL);
1533 }
1534 handler->input = input;
1535 handler->output = output;
1536 handler->name = up;
1537
1538#ifdef LIBXML_ICONV_ENABLED
1539 handler->iconv_in = NULL;
1540 handler->iconv_out = NULL;
1541#endif /* LIBXML_ICONV_ENABLED */
1542
1543 /*
1544 * registers and returns the handler.
1545 */
1546 xmlRegisterCharEncodingHandler(handler);
1547#ifdef DEBUG_ENCODING
1548 xmlGenericError(xmlGenericErrorContext,
1549 "Registered encoding handler for %s\n", name);
1550#endif
1551 return(handler);
1552}
1553
1554/**
1555 * xmlInitCharEncodingHandlers:
1556 *
1557 * Initialize the char encoding support, it registers the default
1558 * encoding supported.
1559 * NOTE: while public, this function usually doesn't need to be called
1560 * in normal processing.
1561 */
1562void
1563xmlInitCharEncodingHandlers(void) {
1564 unsigned short int tst = 0x1234;
1565 unsigned char *ptr = (unsigned char *) &tst;
1566
1567 if (handlers != NULL) return;
1568
1569 handlers = (xmlCharEncodingHandlerPtr *)
1570 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1571
1572 if (*ptr == 0x12) xmlLittleEndian = 0;
1573 else if (*ptr == 0x34) xmlLittleEndian = 1;
1574 else xmlGenericError(xmlGenericErrorContext,
1575 "Odd problem at endianness detection\n");
1576
1577 if (handlers == NULL) {
1578 xmlGenericError(xmlGenericErrorContext,
1579 "xmlInitCharEncodingHandlers : out of memory !\n");
1580 return;
1581 }
1582 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1583 xmlUTF16LEHandler =
1584 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1585 xmlUTF16BEHandler =
1586 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1587 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1588 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
Daniel Veillard20042422001-05-31 18:22:04 +00001589 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
Owen Taylor3473f882001-02-23 17:55:21 +00001590#ifdef LIBXML_HTML_ENABLED
1591 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1592#endif
1593}
1594
1595/**
1596 * xmlCleanupCharEncodingHandlers:
1597 *
1598 * Cleanup the memory allocated for the char encoding support, it
1599 * unregisters all the encoding handlers and the aliases.
1600 */
1601void
1602xmlCleanupCharEncodingHandlers(void) {
1603 xmlCleanupEncodingAliases();
1604
1605 if (handlers == NULL) return;
1606
1607 for (;nbCharEncodingHandler > 0;) {
1608 nbCharEncodingHandler--;
1609 if (handlers[nbCharEncodingHandler] != NULL) {
1610 if (handlers[nbCharEncodingHandler]->name != NULL)
1611 xmlFree(handlers[nbCharEncodingHandler]->name);
1612 xmlFree(handlers[nbCharEncodingHandler]);
1613 }
1614 }
1615 xmlFree(handlers);
1616 handlers = NULL;
1617 nbCharEncodingHandler = 0;
1618 xmlDefaultCharEncodingHandler = NULL;
1619}
1620
1621/**
1622 * xmlRegisterCharEncodingHandler:
1623 * @handler: the xmlCharEncodingHandlerPtr handler block
1624 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001625 * Register the char encoding handler, surprising, isn't it ?
Owen Taylor3473f882001-02-23 17:55:21 +00001626 */
1627void
1628xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1629 if (handlers == NULL) xmlInitCharEncodingHandlers();
1630 if (handler == NULL) {
1631 xmlGenericError(xmlGenericErrorContext,
1632 "xmlRegisterCharEncodingHandler: NULL handler !\n");
1633 return;
1634 }
1635
1636 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1637 xmlGenericError(xmlGenericErrorContext,
1638 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1639 xmlGenericError(xmlGenericErrorContext,
1640 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1641 return;
1642 }
1643 handlers[nbCharEncodingHandler++] = handler;
1644}
1645
1646/**
1647 * xmlGetCharEncodingHandler:
1648 * @enc: an xmlCharEncoding value.
1649 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001650 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001651 *
1652 * Returns the handler or NULL if not found
1653 */
1654xmlCharEncodingHandlerPtr
1655xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1656 xmlCharEncodingHandlerPtr handler;
1657
1658 if (handlers == NULL) xmlInitCharEncodingHandlers();
1659 switch (enc) {
1660 case XML_CHAR_ENCODING_ERROR:
1661 return(NULL);
1662 case XML_CHAR_ENCODING_NONE:
1663 return(NULL);
1664 case XML_CHAR_ENCODING_UTF8:
1665 return(NULL);
1666 case XML_CHAR_ENCODING_UTF16LE:
1667 return(xmlUTF16LEHandler);
1668 case XML_CHAR_ENCODING_UTF16BE:
1669 return(xmlUTF16BEHandler);
1670 case XML_CHAR_ENCODING_EBCDIC:
1671 handler = xmlFindCharEncodingHandler("EBCDIC");
1672 if (handler != NULL) return(handler);
1673 handler = xmlFindCharEncodingHandler("ebcdic");
1674 if (handler != NULL) return(handler);
1675 break;
1676 case XML_CHAR_ENCODING_UCS4BE:
1677 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1678 if (handler != NULL) return(handler);
1679 handler = xmlFindCharEncodingHandler("UCS-4");
1680 if (handler != NULL) return(handler);
1681 handler = xmlFindCharEncodingHandler("UCS4");
1682 if (handler != NULL) return(handler);
1683 break;
1684 case XML_CHAR_ENCODING_UCS4LE:
1685 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1686 if (handler != NULL) return(handler);
1687 handler = xmlFindCharEncodingHandler("UCS-4");
1688 if (handler != NULL) return(handler);
1689 handler = xmlFindCharEncodingHandler("UCS4");
1690 if (handler != NULL) return(handler);
1691 break;
1692 case XML_CHAR_ENCODING_UCS4_2143:
1693 break;
1694 case XML_CHAR_ENCODING_UCS4_3412:
1695 break;
1696 case XML_CHAR_ENCODING_UCS2:
1697 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1698 if (handler != NULL) return(handler);
1699 handler = xmlFindCharEncodingHandler("UCS-2");
1700 if (handler != NULL) return(handler);
1701 handler = xmlFindCharEncodingHandler("UCS2");
1702 if (handler != NULL) return(handler);
1703 break;
1704
1705 /*
1706 * We used to keep ISO Latin encodings native in the
1707 * generated data. This led to so many problems that
1708 * this has been removed. One can still change this
1709 * back by registering no-ops encoders for those
1710 */
1711 case XML_CHAR_ENCODING_8859_1:
1712 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1713 if (handler != NULL) return(handler);
1714 break;
1715 case XML_CHAR_ENCODING_8859_2:
1716 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1717 if (handler != NULL) return(handler);
1718 break;
1719 case XML_CHAR_ENCODING_8859_3:
1720 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1721 if (handler != NULL) return(handler);
1722 break;
1723 case XML_CHAR_ENCODING_8859_4:
1724 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1725 if (handler != NULL) return(handler);
1726 break;
1727 case XML_CHAR_ENCODING_8859_5:
1728 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1729 if (handler != NULL) return(handler);
1730 break;
1731 case XML_CHAR_ENCODING_8859_6:
1732 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1733 if (handler != NULL) return(handler);
1734 break;
1735 case XML_CHAR_ENCODING_8859_7:
1736 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1737 if (handler != NULL) return(handler);
1738 break;
1739 case XML_CHAR_ENCODING_8859_8:
1740 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1741 if (handler != NULL) return(handler);
1742 break;
1743 case XML_CHAR_ENCODING_8859_9:
1744 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1745 if (handler != NULL) return(handler);
1746 break;
1747
1748
1749 case XML_CHAR_ENCODING_2022_JP:
1750 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1751 if (handler != NULL) return(handler);
1752 break;
1753 case XML_CHAR_ENCODING_SHIFT_JIS:
1754 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1755 if (handler != NULL) return(handler);
1756 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1757 if (handler != NULL) return(handler);
1758 handler = xmlFindCharEncodingHandler("Shift_JIS");
1759 if (handler != NULL) return(handler);
1760 break;
1761 case XML_CHAR_ENCODING_EUC_JP:
1762 handler = xmlFindCharEncodingHandler("EUC-JP");
1763 if (handler != NULL) return(handler);
1764 break;
1765 default:
1766 break;
1767 }
1768
1769#ifdef DEBUG_ENCODING
1770 xmlGenericError(xmlGenericErrorContext,
1771 "No handler found for encoding %d\n", enc);
1772#endif
1773 return(NULL);
1774}
1775
1776/**
Daniel Veillard5e2dace2001-07-18 19:30:27 +00001777 * xmlFindCharEncodingHandler:
1778 * @name: a string describing the char encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001779 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001780 * Search in the registered set the handler able to read/write that encoding.
Owen Taylor3473f882001-02-23 17:55:21 +00001781 *
1782 * Returns the handler or NULL if not found
1783 */
1784xmlCharEncodingHandlerPtr
1785xmlFindCharEncodingHandler(const char *name) {
1786 const char *nalias;
1787 const char *norig;
1788 xmlCharEncoding alias;
1789#ifdef LIBXML_ICONV_ENABLED
1790 xmlCharEncodingHandlerPtr enc;
1791 iconv_t icv_in, icv_out;
1792#endif /* LIBXML_ICONV_ENABLED */
1793 char upper[100];
1794 int i;
1795
1796 if (handlers == NULL) xmlInitCharEncodingHandlers();
1797 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1798 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1799
1800 /*
1801 * Do the alias resolution
1802 */
1803 norig = name;
1804 nalias = xmlGetEncodingAlias(name);
1805 if (nalias != NULL)
1806 name = nalias;
1807
1808 /*
1809 * Check first for directly registered encoding names
1810 */
1811 for (i = 0;i < 99;i++) {
1812 upper[i] = toupper(name[i]);
1813 if (upper[i] == 0) break;
1814 }
1815 upper[i] = 0;
1816
1817 for (i = 0;i < nbCharEncodingHandler; i++)
1818 if (!strcmp(upper, handlers[i]->name)) {
1819#ifdef DEBUG_ENCODING
1820 xmlGenericError(xmlGenericErrorContext,
1821 "Found registered handler for encoding %s\n", name);
1822#endif
1823 return(handlers[i]);
1824 }
1825
1826#ifdef LIBXML_ICONV_ENABLED
1827 /* check whether iconv can handle this */
1828 icv_in = iconv_open("UTF-8", name);
1829 icv_out = iconv_open(name, "UTF-8");
1830 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1831 enc = (xmlCharEncodingHandlerPtr)
1832 xmlMalloc(sizeof(xmlCharEncodingHandler));
1833 if (enc == NULL) {
1834 iconv_close(icv_in);
1835 iconv_close(icv_out);
1836 return(NULL);
1837 }
1838 enc->name = xmlMemStrdup(name);
1839 enc->input = NULL;
1840 enc->output = NULL;
1841 enc->iconv_in = icv_in;
1842 enc->iconv_out = icv_out;
1843#ifdef DEBUG_ENCODING
1844 xmlGenericError(xmlGenericErrorContext,
1845 "Found iconv handler for encoding %s\n", name);
1846#endif
1847 return enc;
1848 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1849 xmlGenericError(xmlGenericErrorContext,
1850 "iconv : problems with filters for '%s'\n", name);
1851 }
1852#endif /* LIBXML_ICONV_ENABLED */
1853
1854#ifdef DEBUG_ENCODING
1855 xmlGenericError(xmlGenericErrorContext,
1856 "No handler found for encoding %s\n", name);
1857#endif
1858
1859 /*
1860 * Fallback using the canonical names
1861 */
1862 alias = xmlParseCharEncoding(norig);
1863 if (alias != XML_CHAR_ENCODING_ERROR) {
1864 const char* canon;
1865 canon = xmlGetCharEncodingName(alias);
1866 if ((canon != NULL) && (strcmp(name, canon))) {
1867 return(xmlFindCharEncodingHandler(canon));
1868 }
1869 }
1870
1871 return(NULL);
1872}
1873
/************************************************************************
 *                                                                      *
 *              ICONV based generic conversion functions                *
 *                                                                      *
 ************************************************************************/
1879
Owen Taylor3473f882001-02-23 17:55:21 +00001880#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes in the source encoding of @cd
 * @inlen:  the length of @in
 *
 * Runs one iconv() conversion and maps its outcome to the return codes
 * used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (the input is not valid in its encoding
 *        or cannot be represented in the target encoding), or
 *     -3 if the input ends with an incomplete sequence, or for any other
 *        iconv error.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t srcLeft = *inlen;
    size_t dstLeft = *outlen;
    const char *src = (const char *) in;
    char *dst = (char *) out;
    int rc;

    rc = iconv(cd, (char **) &src, &srcLeft, &dst, &dstLeft);

    if (in == NULL) {
        *inlen = 0;
        *outlen = 0;
    } else {
        /* iconv() leaves the unused byte counts; turn them into used ones */
        *inlen -= srcLeft;
        *outlen -= dstLeft;
    }

    if ((srcLeft == 0) && (rc != -1))
        return 0;

#ifdef EILSEQ
    if (errno == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (errno == E2BIG)
        return -1;
#endif
    /* EINVAL (truncated input) and every other error */
    return -3;
}
1939#endif /* LIBXML_ICONV_ENABLED */
1940
/************************************************************************
 *                                                                      *
 *        The real API used by libxml for on-the-fly conversion         *
 *                                                                      *
 ************************************************************************/
1946
Owen Taylor3473f882001-02-23 17:55:21 +00001947/**
1948 * xmlCharEncFirstLine:
1949 * @handler: char enconding transformation data structure
1950 * @out: an xmlBuffer for the output.
1951 * @in: an xmlBuffer for the input
1952 *
1953 * Front-end for the encoding handler input function, but handle only
1954 * the very first line, i.e. limit itself to 45 chars.
1955 *
1956 * Returns the number of byte written if success, or
1957 * -1 general error
1958 * -2 if the transcoding fails (for *in is not valid utf8 string or
1959 * the result of transformation can't fit into the encoding we want), or
1960 */
1961int
1962xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1963 xmlBufferPtr in) {
1964 int ret = -2;
1965 int written;
1966 int toconv;
1967
1968 if (handler == NULL) return(-1);
1969 if (out == NULL) return(-1);
1970 if (in == NULL) return(-1);
1971
1972 written = out->size - out->use;
1973 toconv = in->use;
1974 if (toconv * 2 >= written) {
1975 xmlBufferGrow(out, toconv);
1976 written = out->size - out->use - 1;
1977 }
1978
1979 /*
1980 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1981 * 45 chars should be sufficient to reach the end of the encoding
Daniel Veillardcbaf3992001-12-31 16:16:02 +00001982 * declaration without going too far inside the document content.
Owen Taylor3473f882001-02-23 17:55:21 +00001983 */
1984 written = 45;
1985
1986 if (handler->input != NULL) {
1987 ret = handler->input(&out->content[out->use], &written,
1988 in->content, &toconv);
1989 xmlBufferShrink(in, toconv);
1990 out->use += written;
1991 out->content[out->use] = 0;
1992 }
1993#ifdef LIBXML_ICONV_ENABLED
1994 else if (handler->iconv_in != NULL) {
1995 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1996 &written, in->content, &toconv);
1997 xmlBufferShrink(in, toconv);
1998 out->use += written;
1999 out->content[out->use] = 0;
2000 if (ret == -1) ret = -3;
2001 }
2002#endif /* LIBXML_ICONV_ENABLED */
2003#ifdef DEBUG_ENCODING
2004 switch (ret) {
2005 case 0:
2006 xmlGenericError(xmlGenericErrorContext,
2007 "converted %d bytes to %d bytes of input\n",
2008 toconv, written);
2009 break;
2010 case -1:
2011 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2012 toconv, written, in->use);
2013 break;
2014 case -2:
2015 xmlGenericError(xmlGenericErrorContext,
2016 "input conversion failed due to input error\n");
2017 break;
2018 case -3:
2019 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2020 toconv, written, in->use);
2021 break;
2022 default:
2023 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2024 }
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002025#endif /* DEBUG_ENCODING */
Owen Taylor3473f882001-02-23 17:55:21 +00002026 /*
2027 * Ignore when input buffer is not on a boundary
2028 */
2029 if (ret == -3) ret = 0;
2030 if (ret == -1) ret = 0;
2031 return(ret);
2032}
2033
2034/**
2035 * xmlCharEncInFunc:
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002036 * @handler: char encoding transformation data structure
Owen Taylor3473f882001-02-23 17:55:21 +00002037 * @out: an xmlBuffer for the output.
2038 * @in: an xmlBuffer for the input
2039 *
2040 * Generic front-end for the encoding handler input function
2041 *
2042 * Returns the number of byte written if success, or
2043 * -1 general error
2044 * -2 if the transcoding fails (for *in is not valid utf8 string or
2045 * the result of transformation can't fit into the encoding we want), or
2046 */
2047int
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002048xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2049 xmlBufferPtr in)
2050{
Owen Taylor3473f882001-02-23 17:55:21 +00002051 int ret = -2;
2052 int written;
2053 int toconv;
2054
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002055 if (handler == NULL)
2056 return (-1);
2057 if (out == NULL)
2058 return (-1);
2059 if (in == NULL)
2060 return (-1);
Owen Taylor3473f882001-02-23 17:55:21 +00002061
2062 toconv = in->use;
2063 if (toconv == 0)
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002064 return (0);
Owen Taylor3473f882001-02-23 17:55:21 +00002065 written = out->size - out->use;
2066 if (toconv * 2 >= written) {
2067 xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002068 written = out->size - out->use - 1;
Owen Taylor3473f882001-02-23 17:55:21 +00002069 }
2070 if (handler->input != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002071 ret = handler->input(&out->content[out->use], &written,
2072 in->content, &toconv);
2073 xmlBufferShrink(in, toconv);
2074 out->use += written;
2075 out->content[out->use] = 0;
Owen Taylor3473f882001-02-23 17:55:21 +00002076 }
2077#ifdef LIBXML_ICONV_ENABLED
2078 else if (handler->iconv_in != NULL) {
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002079 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2080 &written, in->content, &toconv);
2081 xmlBufferShrink(in, toconv);
2082 out->use += written;
2083 out->content[out->use] = 0;
2084 if (ret == -1)
2085 ret = -3;
Owen Taylor3473f882001-02-23 17:55:21 +00002086 }
2087#endif /* LIBXML_ICONV_ENABLED */
2088 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002089 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002090#ifdef DEBUG_ENCODING
2091 xmlGenericError(xmlGenericErrorContext,
2092 "converted %d bytes to %d bytes of input\n",
2093 toconv, written);
Owen Taylor3473f882001-02-23 17:55:21 +00002094#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002095 break;
2096 case -1:
2097#ifdef DEBUG_ENCODING
2098 xmlGenericError(xmlGenericErrorContext,
2099 "converted %d bytes to %d bytes of input, %d left\n",
2100 toconv, written, in->use);
2101#endif
2102 break;
2103 case -3:
2104#ifdef DEBUG_ENCODING
2105 xmlGenericError(xmlGenericErrorContext,
2106 "converted %d bytes to %d bytes of input, %d left\n",
2107 toconv, written, in->use);
2108#endif
2109 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002110 case -2:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002111 xmlGenericError(xmlGenericErrorContext,
2112 "input conversion failed due to input error\n");
2113 xmlGenericError(xmlGenericErrorContext,
2114 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2115 in->content[0], in->content[1],
2116 in->content[2], in->content[3]);
Owen Taylor3473f882001-02-23 17:55:21 +00002117 }
2118 /*
2119 * Ignore when input buffer is not on a boundary
2120 */
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002121 if (ret == -3)
2122 ret = 0;
Daniel Veillardd076a202002-11-20 13:28:31 +00002123 return (written);
Owen Taylor3473f882001-02-23 17:55:21 +00002124}
2125
2126/**
2127 * xmlCharEncOutFunc:
2128 * @handler: char enconding transformation data structure
2129 * @out: an xmlBuffer for the output.
2130 * @in: an xmlBuffer for the input
2131 *
2132 * Generic front-end for the encoding handler output function
2133 * a first call with @in == NULL has to be made firs to initiate the
2134 * output in case of non-stateless encoding needing to initiate their
2135 * state or the output (like the BOM in UTF16).
2136 * In case of UTF8 sequence conversion errors for the given encoder,
2137 * the content will be automatically remapped to a CharRef sequence.
2138 *
2139 * Returns the number of byte written if success, or
2140 * -1 general error
2141 * -2 if the transcoding fails (for *in is not valid utf8 string or
2142 * the result of transformation can't fit into the encoding we want), or
2143 */
2144int
2145xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2146 xmlBufferPtr in) {
2147 int ret = -2;
2148 int written;
2149 int writtentot = 0;
2150 int toconv;
2151 int output = 0;
2152
2153 if (handler == NULL) return(-1);
2154 if (out == NULL) return(-1);
2155
2156retry:
2157
2158 written = out->size - out->use;
2159
2160 /*
2161 * First specific handling of in = NULL, i.e. the initialization call
2162 */
2163 if (in == NULL) {
2164 toconv = 0;
2165 if (handler->output != NULL) {
2166 ret = handler->output(&out->content[out->use], &written,
2167 NULL, &toconv);
2168 out->use += written;
2169 out->content[out->use] = 0;
2170 }
2171#ifdef LIBXML_ICONV_ENABLED
2172 else if (handler->iconv_out != NULL) {
2173 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2174 &written, NULL, &toconv);
2175 out->use += written;
2176 out->content[out->use] = 0;
2177 }
2178#endif /* LIBXML_ICONV_ENABLED */
2179#ifdef DEBUG_ENCODING
2180 xmlGenericError(xmlGenericErrorContext,
2181 "initialized encoder\n");
2182#endif
2183 return(0);
2184 }
2185
2186 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002187 * Conversion itself.
Owen Taylor3473f882001-02-23 17:55:21 +00002188 */
2189 toconv = in->use;
2190 if (toconv == 0)
2191 return(0);
2192 if (toconv * 2 >= written) {
2193 xmlBufferGrow(out, toconv * 2);
2194 written = out->size - out->use - 1;
2195 }
2196 if (handler->output != NULL) {
2197 ret = handler->output(&out->content[out->use], &written,
2198 in->content, &toconv);
2199 xmlBufferShrink(in, toconv);
2200 out->use += written;
2201 writtentot += written;
2202 out->content[out->use] = 0;
2203 }
2204#ifdef LIBXML_ICONV_ENABLED
2205 else if (handler->iconv_out != NULL) {
2206 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2207 &written, in->content, &toconv);
2208 xmlBufferShrink(in, toconv);
2209 out->use += written;
2210 writtentot += written;
2211 out->content[out->use] = 0;
2212 if (ret == -1) {
2213 if (written > 0) {
2214 /*
2215 * Can be a limitation of iconv
2216 */
2217 goto retry;
2218 }
2219 ret = -3;
2220 }
2221 }
2222#endif /* LIBXML_ICONV_ENABLED */
2223 else {
2224 xmlGenericError(xmlGenericErrorContext,
2225 "xmlCharEncOutFunc: no output function !\n");
2226 return(-1);
2227 }
2228
2229 if (ret >= 0) output += ret;
2230
2231 /*
2232 * Attempt to handle error cases
2233 */
2234 switch (ret) {
Owen Taylor3473f882001-02-23 17:55:21 +00002235 case 0:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002236#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002237 xmlGenericError(xmlGenericErrorContext,
2238 "converted %d bytes to %d bytes of output\n",
2239 toconv, written);
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002240#endif
Owen Taylor3473f882001-02-23 17:55:21 +00002241 break;
2242 case -1:
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002243#ifdef DEBUG_ENCODING
Owen Taylor3473f882001-02-23 17:55:21 +00002244 xmlGenericError(xmlGenericErrorContext,
2245 "output conversion failed by lack of space\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002246#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002247 break;
Owen Taylor3473f882001-02-23 17:55:21 +00002248 case -3:
2249 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2250 toconv, written, in->use);
2251 break;
2252 case -2: {
2253 int len = in->use;
2254 const xmlChar *utf = (const xmlChar *) in->content;
2255 int cur;
2256
2257 cur = xmlGetUTF8Char(utf, &len);
2258 if (cur > 0) {
2259 xmlChar charref[20];
2260
2261#ifdef DEBUG_ENCODING
2262 xmlGenericError(xmlGenericErrorContext,
2263 "handling output conversion error\n");
2264 xmlGenericError(xmlGenericErrorContext,
2265 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2266 in->content[0], in->content[1],
2267 in->content[2], in->content[3]);
2268#endif
2269 /*
2270 * Removes the UTF8 sequence, and replace it by a charref
2271 * and continue the transcoding phase, hoping the error
2272 * did not mangle the encoder state.
2273 */
Aleksey Sanin49cc9752002-06-14 17:07:10 +00002274 snprintf((char *) charref, sizeof(charref), "&#%d;", cur);
Owen Taylor3473f882001-02-23 17:55:21 +00002275 xmlBufferShrink(in, len);
2276 xmlBufferAddHead(in, charref, -1);
2277
2278 goto retry;
2279 } else {
2280 xmlGenericError(xmlGenericErrorContext,
2281 "output conversion failed due to conv error\n");
2282 xmlGenericError(xmlGenericErrorContext,
2283 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2284 in->content[0], in->content[1],
2285 in->content[2], in->content[3]);
2286 in->content[0] = ' ';
2287 }
2288 break;
2289 }
2290 }
2291 return(ret);
2292}
2293
2294/**
2295 * xmlCharEncCloseFunc:
2296 * @handler: char enconding transformation data structure
2297 *
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002298 * Generic front-end for encoding handler close function
Owen Taylor3473f882001-02-23 17:55:21 +00002299 *
2300 * Returns 0 if success, or -1 in case of error
2301 */
2302int
2303xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2304 int ret = 0;
2305 if (handler == NULL) return(-1);
2306 if (handler->name == NULL) return(-1);
2307#ifdef LIBXML_ICONV_ENABLED
2308 /*
Daniel Veillardcbaf3992001-12-31 16:16:02 +00002309 * Iconv handlers can be used only once, free the whole block.
Owen Taylor3473f882001-02-23 17:55:21 +00002310 * and the associated icon resources.
2311 */
2312 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2313 if (handler->name != NULL)
2314 xmlFree(handler->name);
2315 handler->name = NULL;
2316 if (handler->iconv_out != NULL) {
2317 if (iconv_close(handler->iconv_out))
2318 ret = -1;
2319 handler->iconv_out = NULL;
2320 }
2321 if (handler->iconv_in != NULL) {
2322 if (iconv_close(handler->iconv_in))
2323 ret = -1;
2324 handler->iconv_in = NULL;
2325 }
2326 xmlFree(handler);
2327 }
2328#endif /* LIBXML_ICONV_ENABLED */
2329#ifdef DEBUG_ENCODING
2330 if (ret)
2331 xmlGenericError(xmlGenericErrorContext,
2332 "failed to close the encoding handler\n");
2333 else
2334 xmlGenericError(xmlGenericErrorContext,
2335 "closed the encoding handler\n");
Owen Taylor3473f882001-02-23 17:55:21 +00002336#endif
Daniel Veillardd79bcd12001-06-21 22:07:42 +00002337
Owen Taylor3473f882001-02-23 17:55:21 +00002338 return(ret);
2339}
2340