blob: 3031ce8c8b99c412b1f7fa7af1b3f4cbb151c837 [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
22
Daniel Veillard3c558c31999-12-22 11:30:41 +000023#ifdef WIN32
24#include "win32config.h"
25#else
Daniel Veillardb96e6431999-08-29 21:02:19 +000026#include "config.h"
Daniel Veillard7f7d1111999-09-22 09:46:25 +000027#endif
28
Daniel Veillard14fff061999-06-22 21:49:07 +000029#include <stdio.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000030#include <string.h>
31
32#ifdef HAVE_CTYPE_H
33#include <ctype.h>
34#endif
Daniel Veillard6d3bf1f1999-12-16 17:52:19 +000035#ifdef HAVE_STDLIB_H
36#include <stdlib.h>
37#endif
Daniel Veillard496a1cf2000-05-03 14:20:55 +000038#include <libxml/xmlversion.h>
39#ifdef LIBXML_ICONV_ENABLED
40#ifdef HAVE_ERRNO_H
41#include <errno.h>
42#endif
43#endif
Daniel Veillard361d8452000-04-03 19:48:13 +000044#include <libxml/encoding.h>
45#include <libxml/xmlmemory.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000046#ifdef LIBXML_HTML_ENABLED
47#include <libxml/HTMLparser.h>
48#endif
Daniel Veillard891e4041998-10-19 00:43:02 +000049
Daniel Veillardcf461992000-03-14 18:30:20 +000050xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
51xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Daniel Veillardb05deb71999-08-10 19:04:08 +000052
Daniel Veillard496a1cf2000-05-03 14:20:55 +000053#ifdef LIBXML_ICONV_ENABLED
54#if 0
55#define DEBUG_ENCODING /* Define this to get encoding traces */
56#endif
57#endif
58
59static int xmlLittleEndian = 1;
60
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 */
71
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the number of bytes available in @utf
 *
 * Read one UTF-8 encoded character from @utf, looking at no more than
 * *@len bytes. Sequences of 1 to 4 bytes are accepted. The decoder is
 * not strict: overlong forms are decoded as is and the resulting value
 * is not checked against the 0x10FFFF limit.
 *
 * Returns the char value or -1 in case of error. On success *@len is
 * updated with the number of bytes used; on error it is set to 0
 * (when @len is not NULL).
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* 10xxxxxx is a continuation byte, it cannot start a sequence */
        if ((c & 0xc0) != 0xc0)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* 11111xxx would announce more than 4 bytes: rejected */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return((int) c);

error:
    /* this label is also reached when @len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
138
139/**
Daniel Veillardcf461992000-03-14 18:30:20 +0000140 * xmlCheckUTF8: Check utf-8 string for legality.
141 * @utf: Pointer to putative utf-8 encoded string.
142 *
143 * Checks @utf for being valid utf-8. @utf is assumed to be
144 * null-terminated. This function is not super-strict, as it will
145 * allow longer utf-8 sequences than necessary. Note that Java is
146 * capable of producing these sequences if provoked. Also note, this
147 * routine checks for the 4-byte maxiumum size, but does not check for
148 * 0x10ffff maximum value.
149 *
150 * Return value: true if @utf is valid.
151 **/
152int
153xmlCheckUTF8(const unsigned char *utf)
154{
155 int ix;
156 unsigned char c;
157
158 for (ix = 0; (c = utf[ix]);) {
159 if (c & 0x80) {
160 if ((utf[ix + 1] & 0xc0) != 0x80)
161 return(0);
162 if ((c & 0xe0) == 0xe0) {
163 if ((utf[ix + 2] & 0xc0) != 0x80)
164 return(0);
165 if ((c & 0xf0) == 0xf0) {
166 if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
167 return(0);
168 ix += 4;
169 /* 4-byte code */
170 } else
171 /* 3-byte code */
172 ix += 3;
173 } else
174 /* 2-byte code */
175 ix += 2;
176 } else
177 /* 1-byte code */
178 ix++;
179 }
180 return(1);
181}
182
183/**
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. An ASCII char is its own UTF-8 encoding, so each
 * input byte yields exactly one output byte; the conversion stops when
 * either buffer is exhausted.
 *
 * Returns 0 if success, or -1 if a byte outside of the ASCII range is
 * met or if @out, @outlen or @inlen is NULL. A NULL @in is treated as
 * an empty input.
 * The value of @inlen after return is the number of octets consumed,
 * the value of @outlen after return is the number of octets produced;
 * both are also set when the conversion stops on a non-ASCII byte.
 */
int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend;
    const unsigned char* inend;
    unsigned int c;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /* nothing to convert */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }

    inend = in + (*inlen);
    outend = out + (*outlen);
    while ((in < inend) && (out < outend)) {
        c = *in;
        if (c >= 0x80) {
            /* not ASCII: report what was converted so far and give up */
            *outlen = (int) (out - outstart);
            *inlen = (int) (in - base);
            return(-1);
        }
        *out++ = (unsigned char) c;
        in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - base);
    return(0);
}
233
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. A NULL @in is an initialization request and
 * produces nothing. An incomplete sequence at the end of @in is left
 * unconsumed so that the caller can supply the missing bytes later.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a char outside of the ASCII range), or -1 if @out, @outlen or @inlen
 * is NULL.
 * The value of @inlen after return is the number of octets consumed,
 * the value of @outlen after return is the number of octets produced;
 * on a -2 return they describe what was converted before the failure.
 */
int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto invalid; /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto invalid; /* no chance for this in Ascii */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a sequence cut short by a non-continuation byte is an error */
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c >= 0x80)
            goto invalid; /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(0);

invalid:
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(-2);
}
316
317/**
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Chars below 0x80 are copied, the others become a
 * 2-byte sequence. The conversion stops as soon as the next char does
 * not fit entirely in @out, so no truncated sequence is ever produced.
 *
 * Returns 0 if success, or -1 if @out, @outlen or @inlen is NULL.
 * A NULL @in is treated as an empty input.
 * The value of @inlen after return is the number of octets consumed,
 * the value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend;
    const unsigned char* inend;
    unsigned int c;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /* nothing to convert */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }

    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (outend - out < 1)
                break;
            *out++ = (unsigned char) c;
        } else {
            /* 0x80..0xFF: 110000xx 10xxxxxx */
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            *out++ = (unsigned char) ((c & 0x3F) | 0x80);
        }
        in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - base);
    return(0);
}
363
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. A NULL @in is an initialization request and
 * produces nothing. An incomplete sequence at the end of @in is left
 * unconsumed so that the caller can supply the missing bytes later.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a char above 0xFF), or -1 if @out, @outlen or @inlen is NULL.
 * The value of @inlen after return is the number of octets consumed,
 * the value of @outlen after return is the number of octets produced;
 * on a -2 return they describe what was converted before the failure.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto invalid; /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto invalid; /* no chance for this in IsoLat1 */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c > 0xFF)
            goto invalid; /* no chance for this in IsoLat1 */
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(0);

invalid:
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(-2);
}
451
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE chars passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE chars in and try to convert it to an UTF-8
 * block of chars out. The input is read one byte at a time, so the
 * result depends neither on the byte order of the machine nor on the
 * alignment of @inb. A trailing odd byte is ignored. A high surrogate
 * at the very end of the input is left unconsumed so that the caller
 * can supply the low surrogate later. A char is only consumed when its
 * whole UTF-8 encoding fits in @out.
 *
 * Returns 0 if success, -2 if the transcoding fails (a high surrogate
 * not followed by a low one), or -1 if @out, @outlen or @inlenb is
 * NULL. A NULL @inb is treated as an empty input.
 * The value of *@inlenb after return is the number of octets consumed,
 * the value of *@outlen after return is the number of octets produced.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned char* outend;
    unsigned int c, d;
    int bits, needed;

    if ((outlen == NULL) || (inlenb == NULL))
        return(-1);
    if (inb == NULL) {
        /* nothing to convert */
        *outlen = 0;
        *inlenb = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlenb = 0;
        return(-1);
    }

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    outend = out + *outlen;
    while (in < inend) {
        /* little endian: least significant byte first */
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {           /* second half not there yet */
                break;
            }
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (processed - inb);
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80) needed = 1;
        else if (c <   0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else                  needed = 4;
        /* never write a partial sequence */
        if (outend - out < needed)
            break;

        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6) {
            *out++= ((c >> bits) & 0x3F) | 0x80;
        }
        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (processed - inb);
    return(0);
}
539
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written one byte at a time, so the
 * result depends neither on the byte order of the machine nor on the
 * alignment of @outb.
 *
 * A NULL @in is an initialization request: the FF FE Byte Order Mark is
 * written when @outb has room for it. An incomplete sequence at the end
 * of @in is left unconsumed so that the caller can supply the missing
 * bytes later.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written by an
 * initialization call, -2 if the transcoding fails (malformed UTF-8 or
 * a value above 0x10FFFF), or -1 if @outb, @outlen or @inlen is NULL.
 * The value of *@inlen after return is the number of octets consumed,
 * the value of *@outlen after return is the number of octets produced;
 * on a -2 return they describe what was converted before the failure.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* processed = in;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if ((outb != NULL) && (*outlen >= 2)) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (outb == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }
    inend = in + *inlen;
    /* only whole 16-bit units are ever written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto invalid; /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto invalid; /* no chance for this in UTF-16 */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c & 0xFF);
            out[1] = (unsigned char) (c >> 8);
            out += 2;
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi & 0xFF);
            out[1] = (unsigned char) (hi >> 8);
            out[2] = (unsigned char) (lo & 0xFF);
            out[3] = (unsigned char) (lo >> 8);
            out += 4;
        }
        else
            goto invalid; /* not representable in UTF-16 */
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(0);

invalid:
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(-2);
}
657
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE chars passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE chars in and try to convert it to an UTF-8
 * block of chars out. The input is read one byte at a time, so the
 * result depends neither on the byte order of the machine nor on the
 * alignment of @inb. A trailing odd byte is ignored. A high surrogate
 * at the very end of the input is left unconsumed so that the caller
 * can supply the low surrogate later. A char is only consumed when its
 * whole UTF-8 encoding fits in @out.
 *
 * Returns 0 if success, -2 if the transcoding fails (a high surrogate
 * not followed by a low one), or -1 if @out, @outlen or @inlenb is
 * NULL. A NULL @inb is treated as an empty input.
 * The value of *@inlenb after return is the number of octets consumed,
 * the value of *@outlen after return is the number of octets produced.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned char* outend;
    unsigned int c, d;
    int bits, needed;

    if ((outlen == NULL) || (inlenb == NULL))
        return(-1);
    if (inb == NULL) {
        /* nothing to convert */
        *outlen = 0;
        *inlenb = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlenb = 0;
        return(-1);
    }

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;
    outend = out + *outlen;
    while (in < inend) {
        /* big endian: most significant byte first */
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {           /* second half not there yet */
                break;
            }
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (processed - inb);
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80) needed = 1;
        else if (c <   0x800) needed = 2;
        else if (c < 0x10000) needed = 3;
        else                  needed = 4;
        /* never write a partial sequence */
        if (outend - out < needed)
            break;

        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits-= 6) {
            *out++= ((c >> bits) & 0x3F) | 0x80;
        }
        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (processed - inb);
    return(0);
}
749
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written one byte at a time, so the
 * result depends neither on the byte order of the machine nor on the
 * alignment of @outb.
 *
 * A NULL @in is an initialization request: the FE FF Byte Order Mark is
 * written when @outb has room for it. An incomplete sequence at the end
 * of @in is left unconsumed so that the caller can supply the missing
 * bytes later.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written by an
 * initialization call, -2 if the transcoding fails (malformed UTF-8 or
 * a value above 0x10FFFF), or -1 if @outb, @outlen or @inlen is NULL.
 * The value of *@inlen after return is the number of octets consumed,
 * the value of *@outlen after return is the number of octets produced;
 * on a -2 return they describe what was converted before the failure.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* processed = in;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int hi, lo;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if ((outb != NULL) && (*outlen >= 2)) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (outb == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }
    inend = in + *inlen;
    /* only whole 16-bit units are ever written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto invalid; /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto invalid; /* no chance for this in UTF-16 */

        /* incomplete sequence: wait for the rest of the input */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            out[0] = (unsigned char) (c >> 8);
            out[1] = (unsigned char) (c & 0xFF);
            out += 2;
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi >> 8);
            out[1] = (unsigned char) (hi & 0xFF);
            out[2] = (unsigned char) (lo >> 8);
            out[3] = (unsigned char) (lo & 0xFF);
            out += 4;
        }
        else
            goto invalid; /* not representable in UTF-16 */
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(0);

invalid:
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(-2);
}
Daniel Veillard97b58771998-10-20 06:14:16 +0000864
Daniel Veillard27d88741999-05-29 11:51:49 +0000865/**
866 * xmlDetectCharEncoding:
867 * @in: a pointer to the first bytes of the XML entity, must be at least
868 * 4 bytes long.
Daniel Veillardcf461992000-03-14 18:30:20 +0000869 * @len: pointer to the length of the buffer
Daniel Veillard27d88741999-05-29 11:51:49 +0000870 *
871 * Guess the encoding of the entity using the first bytes of the entity content
872 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
873 *
874 * Returns one of the XML_CHAR_ENCODING_... values.
875 */
876xmlCharEncoding
Daniel Veillardcf461992000-03-14 18:30:20 +0000877xmlDetectCharEncoding(const unsigned char* in, int len)
Daniel Veillard27d88741999-05-29 11:51:49 +0000878{
Daniel Veillardcf461992000-03-14 18:30:20 +0000879 if (len >= 4) {
880 if ((in[0] == 0x00) && (in[1] == 0x00) &&
881 (in[2] == 0x00) && (in[3] == 0x3C))
882 return(XML_CHAR_ENCODING_UCS4BE);
883 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
884 (in[2] == 0x00) && (in[3] == 0x00))
885 return(XML_CHAR_ENCODING_UCS4LE);
886 if ((in[0] == 0x00) && (in[1] == 0x00) &&
887 (in[2] == 0x3C) && (in[3] == 0x00))
888 return(XML_CHAR_ENCODING_UCS4_2143);
889 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
890 (in[2] == 0x00) && (in[3] == 0x00))
891 return(XML_CHAR_ENCODING_UCS4_3412);
892 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
893 (in[2] == 0xA7) && (in[3] == 0x94))
894 return(XML_CHAR_ENCODING_EBCDIC);
895 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
896 (in[2] == 0x78) && (in[3] == 0x6D))
897 return(XML_CHAR_ENCODING_UTF8);
898 }
899 if (len >= 2) {
900 if ((in[0] == 0xFE) && (in[1] == 0xFF))
901 return(XML_CHAR_ENCODING_UTF16BE);
902 if ((in[0] == 0xFF) && (in[1] == 0xFE))
903 return(XML_CHAR_ENCODING_UTF16LE);
904 }
Daniel Veillard27d88741999-05-29 11:51:49 +0000905 return(XML_CHAR_ENCODING_NONE);
906}
907
908/**
909 * xmlParseCharEncoding:
Daniel Veillard7f858501999-11-17 17:32:38 +0000910 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
Daniel Veillard27d88741999-05-29 11:51:49 +0000911 *
912 * Conpare the string to the known encoding schemes already known. Note
913 * that the comparison is case insensitive accordingly to the section
914 * [XML] 4.3.3 Character Encoding in Entities.
915 *
916 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
917 * if not recognized.
918 */
919xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000920xmlParseCharEncoding(const char* name)
Daniel Veillard27d88741999-05-29 11:51:49 +0000921{
922 char upper[500];
923 int i;
924
925 for (i = 0;i < 499;i++) {
926 upper[i] = toupper(name[i]);
927 if (upper[i] == 0) break;
928 }
929 upper[i] = 0;
930
931 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
932 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
933 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
934
935 /*
936 * NOTE: if we were able to parse this, the endianness of UTF16 is
937 * already found and in use
938 */
939 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
940 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
941
942 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
943 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
944 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
945
946 /*
947 * NOTE: if we were able to parse this, the endianness of UCS4 is
948 * already found and in use
949 */
950 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
951 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
952 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
953
954
955 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
956 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
957 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
958
959 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
960 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
961 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
962
963 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
964 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
965 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
966 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
967 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
968 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
969 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
970
971 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
Daniel Veillard496a1cf2000-05-03 14:20:55 +0000972 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
Daniel Veillard27d88741999-05-29 11:51:49 +0000973 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
Daniel Veillard496a1cf2000-05-03 14:20:55 +0000974
975#ifdef DEBUG_ENCODING
976 fprintf(stderr, "Unknown encoding %s\n", name);
977#endif
Daniel Veillard27d88741999-05-29 11:51:49 +0000978 return(XML_CHAR_ENCODING_ERROR);
979}
Daniel Veillard14fff061999-06-22 21:49:07 +0000980
Daniel Veillardbe803962000-06-28 23:40:59 +0000981/**
982 * xmlGetCharEncodingName:
983 * @enc: the encoding
984 *
985 * The "canonical" name for XML encoding.
986 * C.f. http://www.w3.org/TR/REC-xml#charencoding
987 * Section 4.3.3 Character Encoding in Entities
988 *
989 * Returns the canonical name for the given encoding
990 */
991
992const char*
993xmlGetCharEncodingName(xmlCharEncoding enc) {
994 switch (enc) {
995 case XML_CHAR_ENCODING_ERROR:
996 return(NULL);
997 case XML_CHAR_ENCODING_NONE:
998 return(NULL);
999 case XML_CHAR_ENCODING_UTF8:
1000 return("UTF-8");
1001 case XML_CHAR_ENCODING_UTF16LE:
1002 return("UTF-16");
1003 case XML_CHAR_ENCODING_UTF16BE:
1004 return("UTF-16");
1005 case XML_CHAR_ENCODING_EBCDIC:
1006 return("EBCDIC");
1007 case XML_CHAR_ENCODING_UCS4LE:
1008 return("ISO-10646-UCS-4");
1009 case XML_CHAR_ENCODING_UCS4BE:
1010 return("ISO-10646-UCS-4");
1011 case XML_CHAR_ENCODING_UCS4_2143:
1012 return("ISO-10646-UCS-4");
1013 case XML_CHAR_ENCODING_UCS4_3412:
1014 return("ISO-10646-UCS-4");
1015 case XML_CHAR_ENCODING_UCS2:
1016 return("ISO-10646-UCS-2");
1017 case XML_CHAR_ENCODING_8859_1:
1018 return("ISO-8859-1");
1019 case XML_CHAR_ENCODING_8859_2:
1020 return("ISO-8859-2");
1021 case XML_CHAR_ENCODING_8859_3:
1022 return("ISO-8859-3");
1023 case XML_CHAR_ENCODING_8859_4:
1024 return("ISO-8859-4");
1025 case XML_CHAR_ENCODING_8859_5:
1026 return("ISO-8859-5");
1027 case XML_CHAR_ENCODING_8859_6:
1028 return("ISO-8859-6");
1029 case XML_CHAR_ENCODING_8859_7:
1030 return("ISO-8859-7");
1031 case XML_CHAR_ENCODING_8859_8:
1032 return("ISO-8859-8");
1033 case XML_CHAR_ENCODING_8859_9:
1034 return("ISO-8859-9");
1035 case XML_CHAR_ENCODING_2022_JP:
1036 return("ISO-2022-JP");
1037 case XML_CHAR_ENCODING_SHIFT_JIS:
1038 return("Shift-JIS");
1039 case XML_CHAR_ENCODING_EUC_JP:
1040 return("EUC-JP");
Daniel Veillard87b95392000-08-12 21:12:04 +00001041 case XML_CHAR_ENCODING_ASCII:
1042 return(NULL);
Daniel Veillardbe803962000-06-28 23:40:59 +00001043 }
1044 return(NULL);
1045}
1046
Daniel Veillard14fff061999-06-22 21:49:07 +00001047/****************************************************************
1048 * *
1049 * Char encoding handlers *
1050 * *
1051 ****************************************************************/
1052
1053/* the size should be growable, but it's not a big deal ... */
/* Number of slots allocated for the handler registry below. */
1054#define MAX_ENCODING_HANDLERS 50
/*
 * Registry of encoding handlers: NULL until xmlInitCharEncodingHandlers()
 * allocates MAX_ENCODING_HANDLERS slots, released again by
 * xmlCleanupCharEncodingHandlers().
 */
1055static xmlCharEncodingHandlerPtr *handlers = NULL;
/* Number of slots of the registry currently in use. */
1056static int nbCharEncodingHandler = 0;
1057
1058/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * This is the value xmlFindCharEncodingHandler() returns when it is given
 * a NULL or empty encoding name.
 */
1062
1063static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1064
1065/**
1066 * xmlNewCharEncodingHandler:
Daniel Veillard7f858501999-11-17 17:32:38 +00001067 * @name: the encoding name, in UTF-8 format (ASCII actually)
Daniel Veillard14fff061999-06-22 21:49:07 +00001068 * @input: the xmlCharEncodingInputFunc to read that encoding
1069 * @output: the xmlCharEncodingOutputFunc to write that encoding
1070 *
1071 * Create and registers an xmlCharEncodingHandler.
1072 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1073 */
1074xmlCharEncodingHandlerPtr
Daniel Veillardcf461992000-03-14 18:30:20 +00001075xmlNewCharEncodingHandler(const char *name,
1076 xmlCharEncodingInputFunc input,
Daniel Veillard14fff061999-06-22 21:49:07 +00001077 xmlCharEncodingOutputFunc output) {
1078 xmlCharEncodingHandlerPtr handler;
1079 char upper[500];
1080 int i;
1081 char *up = 0;
1082
1083 /*
1084 * Keep only the uppercase version of the encoding.
1085 */
1086 if (name == NULL) {
1087 fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
1088 return(NULL);
1089 }
1090 for (i = 0;i < 499;i++) {
1091 upper[i] = toupper(name[i]);
1092 if (upper[i] == 0) break;
1093 }
1094 upper[i] = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00001095 up = xmlMemStrdup(upper);
Daniel Veillard14fff061999-06-22 21:49:07 +00001096 if (up == NULL) {
1097 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
1098 return(NULL);
1099 }
1100
1101 /*
1102 * allocate and fill-up an handler block.
1103 */
1104 handler = (xmlCharEncodingHandlerPtr)
Daniel Veillard6454aec1999-09-02 22:04:43 +00001105 xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard14fff061999-06-22 21:49:07 +00001106 if (handler == NULL) {
1107 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
1108 return(NULL);
1109 }
1110 handler->input = input;
1111 handler->output = output;
1112 handler->name = up;
1113
Daniel Veillard87b95392000-08-12 21:12:04 +00001114#ifdef LIBXML_ICONV_ENABLED
1115 handler->iconv_in = NULL;
1116 handler->iconv_out = NULL;
1117#endif /* LIBXML_ICONV_ENABLED */
1118
Daniel Veillard14fff061999-06-22 21:49:07 +00001119 /*
1120 * registers and returns the handler.
1121 */
1122 xmlRegisterCharEncodingHandler(handler);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001123#ifdef DEBUG_ENCODING
1124 fprintf(stderr, "Registered encoding handler for %s\n", name);
1125#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001126 return(handler);
1127}
1128
1129/**
1130 * xmlInitCharEncodingHandlers:
1131 *
1132 * Initialize the char encoding support, it registers the default
1133 * encoding supported.
Daniel Veillard7f858501999-11-17 17:32:38 +00001134 * NOTE: while public, this function usually doesn't need to be called
Daniel Veillard14fff061999-06-22 21:49:07 +00001135 * in normal processing.
1136 */
1137void
1138xmlInitCharEncodingHandlers(void) {
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001139 unsigned short int tst = 0x1234;
1140 unsigned char *ptr = (unsigned char *) &tst;
1141
Daniel Veillard14fff061999-06-22 21:49:07 +00001142 if (handlers != NULL) return;
1143
1144 handlers = (xmlCharEncodingHandlerPtr *)
Daniel Veillard6454aec1999-09-02 22:04:43 +00001145 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
Daniel Veillard14fff061999-06-22 21:49:07 +00001146
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001147 if (*ptr == 0x12) xmlLittleEndian = 0;
1148 else if (*ptr == 0x34) xmlLittleEndian = 1;
1149 else fprintf(stderr, "Odd problem at endianness detection\n");
1150
Daniel Veillard14fff061999-06-22 21:49:07 +00001151 if (handlers == NULL) {
1152 fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
1153 return;
1154 }
1155 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
Daniel Veillardcf461992000-03-14 18:30:20 +00001156 xmlUTF16LEHandler =
1157 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1158 xmlUTF16BEHandler =
1159 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
Daniel Veillard14fff061999-06-22 21:49:07 +00001160 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001161 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1162#ifdef LIBXML_HTML_ENABLED
1163 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1164#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001165}
1166
1167/**
Daniel Veillarda819dac1999-11-24 18:04:22 +00001168 * xmlCleanupCharEncodingHandlers:
1169 *
1170 * Cleanup the memory allocated for the char encoding support, it
1171 * unregisters all the encoding handlers.
1172 */
1173void
1174xmlCleanupCharEncodingHandlers(void) {
1175 if (handlers == NULL) return;
1176
1177 for (;nbCharEncodingHandler > 0;) {
1178 nbCharEncodingHandler--;
1179 if (handlers[nbCharEncodingHandler] != NULL) {
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001180 if (handlers[nbCharEncodingHandler]->name != NULL)
1181 xmlFree(handlers[nbCharEncodingHandler]->name);
Daniel Veillarda819dac1999-11-24 18:04:22 +00001182 xmlFree(handlers[nbCharEncodingHandler]);
1183 }
1184 }
1185 xmlFree(handlers);
1186 handlers = NULL;
1187 nbCharEncodingHandler = 0;
1188 xmlDefaultCharEncodingHandler = NULL;
1189}
1190
1191/**
Daniel Veillard14fff061999-06-22 21:49:07 +00001192 * xmlRegisterCharEncodingHandler:
1193 * @handler: the xmlCharEncodingHandlerPtr handler block
1194 *
1195 * Register the char encoding handler, surprizing, isn't it ?
1196 */
1197void
1198xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1199 if (handlers == NULL) xmlInitCharEncodingHandlers();
1200 if (handler == NULL) {
1201 fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
1202 return;
1203 }
1204
1205 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1206 fprintf(stderr,
1207 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1208 fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1209 return;
1210 }
1211 handlers[nbCharEncodingHandler++] = handler;
1212}
1213
1214/**
1215 * xmlGetCharEncodingHandler:
1216 * @enc: an xmlCharEncoding value.
1217 *
1218 * Search in the registrered set the handler able to read/write that encoding.
1219 *
1220 * Returns the handler or NULL if not found
1221 */
1222xmlCharEncodingHandlerPtr
1223xmlGetCharEncodingHandler(xmlCharEncoding enc) {
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001224 xmlCharEncodingHandlerPtr handler;
1225
Daniel Veillard14fff061999-06-22 21:49:07 +00001226 if (handlers == NULL) xmlInitCharEncodingHandlers();
Daniel Veillardcf461992000-03-14 18:30:20 +00001227 switch (enc) {
1228 case XML_CHAR_ENCODING_ERROR:
1229 return(NULL);
1230 case XML_CHAR_ENCODING_NONE:
1231 return(NULL);
1232 case XML_CHAR_ENCODING_UTF8:
1233 return(NULL);
1234 case XML_CHAR_ENCODING_UTF16LE:
1235 return(xmlUTF16LEHandler);
1236 case XML_CHAR_ENCODING_UTF16BE:
1237 return(xmlUTF16BEHandler);
1238 case XML_CHAR_ENCODING_EBCDIC:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001239 handler = xmlFindCharEncodingHandler("EBCDIC");
1240 if (handler != NULL) return(handler);
1241 handler = xmlFindCharEncodingHandler("ebcdic");
1242 if (handler != NULL) return(handler);
1243 break;
Daniel Veillardbe803962000-06-28 23:40:59 +00001244 case XML_CHAR_ENCODING_UCS4BE:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001245 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1246 if (handler != NULL) return(handler);
1247 handler = xmlFindCharEncodingHandler("UCS-4");
1248 if (handler != NULL) return(handler);
1249 handler = xmlFindCharEncodingHandler("UCS4");
1250 if (handler != NULL) return(handler);
1251 break;
Daniel Veillardbe803962000-06-28 23:40:59 +00001252 case XML_CHAR_ENCODING_UCS4LE:
1253 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1254 if (handler != NULL) return(handler);
1255 handler = xmlFindCharEncodingHandler("UCS-4");
1256 if (handler != NULL) return(handler);
1257 handler = xmlFindCharEncodingHandler("UCS4");
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001258 if (handler != NULL) return(handler);
1259 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001260 case XML_CHAR_ENCODING_UCS4_2143:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001261 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001262 case XML_CHAR_ENCODING_UCS4_3412:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001263 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001264 case XML_CHAR_ENCODING_UCS2:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001265 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1266 if (handler != NULL) return(handler);
1267 handler = xmlFindCharEncodingHandler("UCS-2");
1268 if (handler != NULL) return(handler);
1269 handler = xmlFindCharEncodingHandler("UCS2");
1270 if (handler != NULL) return(handler);
1271 break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001272
1273 /*
1274 * We used to keep ISO Latin encodings native in the
1275 * generated data. This led to so many problems that
1276 * this has been removed. One can still change this
1277 * back by registering no-ops encoders for those
1278 */
Daniel Veillardcf461992000-03-14 18:30:20 +00001279 case XML_CHAR_ENCODING_8859_1:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001280 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1281 if (handler != NULL) return(handler);
1282 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001283 case XML_CHAR_ENCODING_8859_2:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001284 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1285 if (handler != NULL) return(handler);
1286 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001287 case XML_CHAR_ENCODING_8859_3:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001288 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1289 if (handler != NULL) return(handler);
1290 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001291 case XML_CHAR_ENCODING_8859_4:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001292 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1293 if (handler != NULL) return(handler);
1294 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001295 case XML_CHAR_ENCODING_8859_5:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001296 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1297 if (handler != NULL) return(handler);
1298 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001299 case XML_CHAR_ENCODING_8859_6:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001300 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1301 if (handler != NULL) return(handler);
1302 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001303 case XML_CHAR_ENCODING_8859_7:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001304 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1305 if (handler != NULL) return(handler);
1306 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001307 case XML_CHAR_ENCODING_8859_8:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001308 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1309 if (handler != NULL) return(handler);
1310 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001311 case XML_CHAR_ENCODING_8859_9:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001312 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1313 if (handler != NULL) return(handler);
1314 break;
1315
1316
Daniel Veillardcf461992000-03-14 18:30:20 +00001317 case XML_CHAR_ENCODING_2022_JP:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001318 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1319 if (handler != NULL) return(handler);
1320 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001321 case XML_CHAR_ENCODING_SHIFT_JIS:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001322 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1323 if (handler != NULL) return(handler);
1324 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1325 if (handler != NULL) return(handler);
1326 handler = xmlFindCharEncodingHandler("Shift_JIS");
1327 if (handler != NULL) return(handler);
1328 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001329 case XML_CHAR_ENCODING_EUC_JP:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001330 handler = xmlFindCharEncodingHandler("EUC-JP");
1331 if (handler != NULL) return(handler);
1332 break;
1333 default:
1334 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001335 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001336
1337#ifdef DEBUG_ENCODING
1338 fprintf(stderr, "No handler found for encoding %d\n", enc);
1339#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001340 return(NULL);
1341}
1342
1343/**
1344 * xmlGetCharEncodingHandler:
1345 * @enc: a string describing the char encoding.
1346 *
1347 * Search in the registrered set the handler able to read/write that encoding.
1348 *
1349 * Returns the handler or NULL if not found
1350 */
1351xmlCharEncodingHandlerPtr
1352xmlFindCharEncodingHandler(const char *name) {
Daniel Veillardbe803962000-06-28 23:40:59 +00001353 xmlCharEncoding alias;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001354#ifdef LIBXML_ICONV_ENABLED
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00001355 xmlCharEncodingHandlerPtr enc;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001356 iconv_t icv_in, icv_out;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001357#endif /* LIBXML_ICONV_ENABLED */
1358 char upper[100];
Daniel Veillard14fff061999-06-22 21:49:07 +00001359 int i;
1360
1361 if (handlers == NULL) xmlInitCharEncodingHandlers();
1362 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1363 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1364
Daniel Veillardbe803962000-06-28 23:40:59 +00001365 /*
1366 * Check first for directly registered encoding names
1367 */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001368 for (i = 0;i < 99;i++) {
Daniel Veillard14fff061999-06-22 21:49:07 +00001369 upper[i] = toupper(name[i]);
1370 if (upper[i] == 0) break;
1371 }
1372 upper[i] = 0;
1373
1374 for (i = 0;i < nbCharEncodingHandler; i++)
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001375 if (!strcmp(upper, handlers[i]->name)) {
1376#ifdef DEBUG_ENCODING
1377 fprintf(stderr, "Found registered handler for encoding %s\n", name);
1378#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001379 return(handlers[i]);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001380 }
Daniel Veillard14fff061999-06-22 21:49:07 +00001381
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001382#ifdef LIBXML_ICONV_ENABLED
1383 /* check whether iconv can handle this */
1384 icv_in = iconv_open("UTF-8", name);
1385 icv_out = iconv_open(name, "UTF-8");
1386 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001387 enc = (xmlCharEncodingHandlerPtr)
1388 xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001389 if (enc == NULL) {
1390 iconv_close(icv_in);
1391 iconv_close(icv_out);
1392 return(NULL);
1393 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00001394 enc->name = xmlMemStrdup(name);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001395 enc->input = NULL;
1396 enc->output = NULL;
1397 enc->iconv_in = icv_in;
1398 enc->iconv_out = icv_out;
1399#ifdef DEBUG_ENCODING
1400 fprintf(stderr, "Found iconv handler for encoding %s\n", name);
1401#endif
1402 return enc;
1403 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1404 fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
1405 }
1406#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillardbe803962000-06-28 23:40:59 +00001407
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001408#ifdef DEBUG_ENCODING
1409 fprintf(stderr, "No handler found for encoding %s\n", name);
1410#endif
Daniel Veillardbe803962000-06-28 23:40:59 +00001411
1412 /*
1413 * Fallback using the canonical names
1414 */
1415 alias = xmlParseCharEncoding(name);
1416 if (alias != XML_CHAR_ENCODING_ERROR) {
1417 const char* canon;
1418 canon = xmlGetCharEncodingName(alias);
1419 if ((canon != NULL) && (strcmp(name, canon))) {
1420 return(xmlFindCharEncodingHandler(canon));
1421 }
1422 }
1423
Daniel Veillard14fff061999-06-22 21:49:07 +00001424 return(NULL);
1425}
1426
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001427#ifdef LIBXML_ICONV_ENABLED
1428/**
1429 * xmlIconvWrapper:
1430 * @cd: iconv converter data structure
1431 * @out: a pointer to an array of bytes to store the result
1432 * @outlen: the length of @out
1433 * @in: a pointer to an array of ISO Latin 1 chars
1434 * @inlen: the length of @in
1435 *
1436 * Returns 0 if success, or
1437 * -1 by lack of space, or
1438 * -2 if the transcoding fails (for *in is not valid utf8 string or
1439 * the result of transformation can't fit into the encoding we want), or
1440 * -3 if there the last byte can't form a single output char.
1441 *
1442 * The value of @inlen after return is the number of octets consumed
1443 * as the return value is positive, else unpredictiable.
1444 * The value of @outlen after return is the number of ocetes consumed.
1445 */
1446static int
1447xmlIconvWrapper(iconv_t cd,
1448 unsigned char *out, int *outlen,
1449 const unsigned char *in, int *inlen) {
1450
1451 size_t icv_inlen = *inlen, icv_outlen = *outlen;
1452 const char *icv_in = (const char *) in;
1453 char *icv_out = (char *) out;
1454 int ret;
1455
1456 ret = iconv(cd,
1457 &icv_in, &icv_inlen,
1458 &icv_out, &icv_outlen);
Daniel Veillardbe803962000-06-28 23:40:59 +00001459 if (in != NULL) {
1460 *inlen -= icv_inlen;
1461 *outlen -= icv_outlen;
1462 } else {
1463 *inlen = 0;
1464 *outlen = 0;
1465 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001466 if (icv_inlen != 0 || ret == (size_t) -1) {
1467#ifdef EILSEQ
1468 if (errno == EILSEQ) {
1469 return -2;
1470 } else
1471#endif
1472#ifdef E2BIG
1473 if (errno == E2BIG) {
1474 return -1;
1475 } else
1476#endif
1477#ifdef EINVAL
1478 if (errno == EINVAL) {
1479 return -3;
1480 }
1481#endif
1482 else {
1483 return -3;
1484 }
1485 }
1486 return 0;
1487}
1488#endif /* LIBXML_ICONV_ENABLED */
1489
1490/**
Daniel Veillardbe803962000-06-28 23:40:59 +00001491 * xmlCharEncFirstLine:
1492 * @handler: char enconding transformation data structure
1493 * @out: an xmlBuffer for the output.
1494 * @in: an xmlBuffer for the input
1495 *
1496 * Front-end for the encoding handler input function, but handle only
1497 * the very first line, i.e. limit itself to 45 chars.
1498 *
1499 * Returns the number of byte written if success, or
1500 * -1 general error
1501 * -2 if the transcoding fails (for *in is not valid utf8 string or
1502 * the result of transformation can't fit into the encoding we want), or
1503 */
1504int
1505xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1506 xmlBufferPtr in) {
1507 int ret = -2;
1508 int written;
1509 int toconv;
1510
1511 if (handler == NULL) return(-1);
1512 if (out == NULL) return(-1);
1513 if (in == NULL) return(-1);
1514
1515 written = out->size - out->use;
1516 toconv = in->use;
1517 if (toconv * 2 >= written) {
1518 xmlBufferGrow(out, toconv);
1519 written = out->size - out->use - 1;
1520 }
1521
1522 /*
1523 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1524 * 45 chars should be sufficient to reach the end of the encoding
1525 * decalration without going too far inside the document content.
1526 */
1527 written = 45;
1528
1529 if (handler->input != NULL) {
1530 ret = handler->input(&out->content[out->use], &written,
1531 in->content, &toconv);
1532 xmlBufferShrink(in, toconv);
1533 out->use += written;
1534 out->content[out->use] = 0;
1535 }
1536#ifdef LIBXML_ICONV_ENABLED
1537 else if (handler->iconv_in != NULL) {
1538 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1539 &written, in->content, &toconv);
1540 xmlBufferShrink(in, toconv);
1541 out->use += written;
1542 out->content[out->use] = 0;
1543 if (ret == -1) ret = -3;
1544 }
1545#endif /* LIBXML_ICONV_ENABLED */
1546#ifdef DEBUG_ENCODING
1547 switch (ret) {
1548 case 0:
1549 fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1550 toconv, written);
1551 break;
1552 case -1:
1553 fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1554 toconv, written, in->use);
1555 break;
1556 case -2:
1557 fprintf(stderr, "input conversion failed due to input error\n");
1558 break;
1559 case -3:
1560 fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1561 toconv, written, in->use);
1562 break;
1563 default:
1564 fprintf(stderr,"Unknown input conversion failed %d\n", ret);
1565 }
1566#endif
1567 /*
1568 * Ignore when input buffer is not on a boundary
1569 */
1570 if (ret == -3) ret = 0;
1571 if (ret == -1) ret = 0;
1572 return(ret);
1573}
1574
1575/**
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001576 * xmlCharEncInFunc:
1577 * @handler: char enconding transformation data structure
1578 * @out: an xmlBuffer for the output.
1579 * @in: an xmlBuffer for the input
1580 *
1581 * Generic front-end for the encoding handler input function
1582 *
1583 * Returns the number of byte written if success, or
1584 * -1 general error
1585 * -2 if the transcoding fails (for *in is not valid utf8 string or
1586 * the result of transformation can't fit into the encoding we want), or
1587 */
1588int
1589xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1590 xmlBufferPtr in) {
1591 int ret = -2;
1592 int written;
1593 int toconv;
1594
1595 if (handler == NULL) return(-1);
1596 if (out == NULL) return(-1);
1597 if (in == NULL) return(-1);
1598
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001599 toconv = in->use;
Daniel Veillard87b95392000-08-12 21:12:04 +00001600 if (toconv == 0)
1601 return(0);
1602 written = out->size - out->use;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001603 if (toconv * 2 >= written) {
1604 xmlBufferGrow(out, toconv * 2);
1605 written = out->size - out->use - 1;
1606 }
1607 if (handler->input != NULL) {
1608 ret = handler->input(&out->content[out->use], &written,
1609 in->content, &toconv);
1610 xmlBufferShrink(in, toconv);
1611 out->use += written;
1612 out->content[out->use] = 0;
1613 }
1614#ifdef LIBXML_ICONV_ENABLED
1615 else if (handler->iconv_in != NULL) {
1616 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1617 &written, in->content, &toconv);
1618 xmlBufferShrink(in, toconv);
1619 out->use += written;
1620 out->content[out->use] = 0;
1621 if (ret == -1) ret = -3;
1622 }
1623#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001624 switch (ret) {
Daniel Veillardbe803962000-06-28 23:40:59 +00001625#ifdef DEBUG_ENCODING
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001626 case 0:
1627 fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1628 toconv, written);
1629 break;
1630 case -1:
1631 fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1632 toconv, written, in->use);
1633 break;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001634 case -3:
1635 fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1636 toconv, written, in->use);
1637 break;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001638#endif
Daniel Veillardbe803962000-06-28 23:40:59 +00001639 case -2:
1640 fprintf(stderr, "input conversion failed due to input error\n");
1641 fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1642 in->content[0], in->content[1],
1643 in->content[2], in->content[3]);
1644 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001645 /*
1646 * Ignore when input buffer is not on a boundary
1647 */
1648 if (ret == -3) ret = 0;
1649 return(ret);
1650}
1651
1652/**
1653 * xmlCharEncOutFunc:
1654 * @handler: char enconding transformation data structure
1655 * @out: an xmlBuffer for the output.
1656 * @in: an xmlBuffer for the input
1657 *
1658 * Generic front-end for the encoding handler output function
Daniel Veillardbe803962000-06-28 23:40:59 +00001659 * a first call with @in == NULL has to be made firs to initiate the
1660 * output in case of non-stateless encoding needing to initiate their
1661 * state or the output (like the BOM in UTF16).
1662 * In case of UTF8 sequence conversion errors for the given encoder,
1663 * the content will be automatically remapped to a CharRef sequence.
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001664 *
1665 * Returns the number of byte written if success, or
1666 * -1 general error
1667 * -2 if the transcoding fails (for *in is not valid utf8 string or
1668 * the result of transformation can't fit into the encoding we want), or
1669 */
1670int
1671xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1672 xmlBufferPtr in) {
1673 int ret = -2;
1674 int written;
1675 int toconv;
Daniel Veillardbe803962000-06-28 23:40:59 +00001676 int output = 0;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001677
1678 if (handler == NULL) return(-1);
1679 if (out == NULL) return(-1);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001680
Daniel Veillardbe803962000-06-28 23:40:59 +00001681retry:
1682
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001683 written = out->size - out->use;
Daniel Veillardbe803962000-06-28 23:40:59 +00001684
1685 /*
1686 * First specific handling of in = NULL, i.e. the initialization call
1687 */
1688 if (in == NULL) {
1689 toconv = 0;
1690 if (handler->output != NULL) {
1691 ret = handler->output(&out->content[out->use], &written,
1692 NULL, &toconv);
1693 out->use += written;
1694 out->content[out->use] = 0;
1695 }
1696#ifdef LIBXML_ICONV_ENABLED
1697 else if (handler->iconv_out != NULL) {
1698 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1699 &written, NULL, &toconv);
1700 out->use += written;
1701 out->content[out->use] = 0;
1702 }
1703#endif /* LIBXML_ICONV_ENABLED */
1704#ifdef DEBUG_ENCODING
1705 fprintf(stderr, "initialized encoder\n");
1706#endif
1707 return(0);
1708 }
1709
1710 /*
1711 * Convertion itself.
1712 */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001713 toconv = in->use;
Daniel Veillard87b95392000-08-12 21:12:04 +00001714 if (toconv == 0)
1715 return(0);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001716 if (toconv * 2 >= written) {
1717 xmlBufferGrow(out, toconv * 2);
1718 written = out->size - out->use - 1;
1719 }
1720 if (handler->output != NULL) {
1721 ret = handler->output(&out->content[out->use], &written,
Daniel Veillardbe803962000-06-28 23:40:59 +00001722 in->content, &toconv);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001723 xmlBufferShrink(in, toconv);
1724 out->use += written;
1725 out->content[out->use] = 0;
1726 }
1727#ifdef LIBXML_ICONV_ENABLED
1728 else if (handler->iconv_out != NULL) {
1729 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1730 &written, in->content, &toconv);
1731 xmlBufferShrink(in, toconv);
1732 out->use += written;
1733 out->content[out->use] = 0;
1734 if (ret == -1) ret = -3;
1735 }
1736#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001737 else {
1738 fprintf(stderr, "xmlCharEncOutFunc: no output function !\n");
1739 return(-1);
1740 }
Daniel Veillardbe803962000-06-28 23:40:59 +00001741
1742 if (ret >= 0) output += ret;
1743
1744 /*
1745 * Attempt to handle error cases
1746 */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001747 switch (ret) {
Daniel Veillardbe803962000-06-28 23:40:59 +00001748#ifdef DEBUG_ENCODING
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001749 case 0:
1750 fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1751 toconv, written);
1752 break;
1753 case -1:
1754 fprintf(stderr, "output conversion failed by lack of space\n");
1755 break;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001756 case -3:
1757 fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
1758 toconv, written, in->use);
1759 break;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001760#endif
Daniel Veillardbe803962000-06-28 23:40:59 +00001761 case -2: {
1762 int len = in->use;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001763 const xmlChar *utf = (const xmlChar *) in->content;
Daniel Veillardbe803962000-06-28 23:40:59 +00001764 int cur;
1765
1766 cur = xmlGetUTF8Char(utf, &len);
1767 if (cur > 0) {
1768 xmlChar charref[20];
1769
1770#ifdef DEBUG_ENCODING
1771 fprintf(stderr, "handling output conversion error\n");
1772 fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1773 in->content[0], in->content[1],
1774 in->content[2], in->content[3]);
1775#endif
1776 /*
1777 * Removes the UTF8 sequence, and replace it by a charref
1778 * and continue the transcoding phase, hoping the error
1779 * did not mangle the encoder state.
1780 */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001781 sprintf((char *) charref, "&#x%X;", cur);
Daniel Veillardbe803962000-06-28 23:40:59 +00001782 xmlBufferShrink(in, len);
1783 xmlBufferAddHead(in, charref, -1);
1784
1785 goto retry;
1786 } else {
1787 fprintf(stderr, "output conversion failed due to conv error\n");
1788 fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1789 in->content[0], in->content[1],
1790 in->content[2], in->content[3]);
Daniel Veillard87b95392000-08-12 21:12:04 +00001791 in->content[0] = ' ';
Daniel Veillardbe803962000-06-28 23:40:59 +00001792 }
1793 break;
1794 }
1795 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001796 return(ret);
1797}
1798
1799/**
1800 * xmlCharEncCloseFunc:
1801 * @handler: char enconding transformation data structure
1802 *
1803 * Generic front-end for hencoding handler close function
1804 *
1805 * Returns 0 if success, or -1 in case of error
1806 */
1807int
1808xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
1809 int ret = 0;
1810 if (handler == NULL) return(-1);
1811 if (handler->name == NULL) return(-1);
1812#ifdef LIBXML_ICONV_ENABLED
1813 /*
1814 * Iconv handlers can be oused only once, free the whole block.
1815 * and the associated icon resources.
1816 */
1817 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
1818 if (handler->name != NULL)
1819 xmlFree(handler->name);
1820 handler->name = NULL;
1821 if (handler->iconv_out != NULL) {
1822 if (iconv_close(handler->iconv_out))
1823 ret = -1;
1824 handler->iconv_out = NULL;
1825 }
1826 if (handler->iconv_in != NULL) {
1827 if (iconv_close(handler->iconv_in))
1828 ret = -1;
1829 handler->iconv_in = NULL;
1830 }
1831 xmlFree(handler);
1832 }
1833#endif /* LIBXML_ICONV_ENABLED */
1834#ifdef DEBUG_ENCODING
1835 if (ret)
1836 fprintf(stderr, "failed to close the encoding handler\n");
1837 else
1838 fprintf(stderr, "closed the encoding handler\n");
1839
1840#endif
1841 return(ret);
1842}
1843