/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
22
Daniel Veillard3c558c31999-12-22 11:30:41 +000023#ifdef WIN32
24#include "win32config.h"
25#else
Daniel Veillardb96e6431999-08-29 21:02:19 +000026#include "config.h"
Daniel Veillard7f7d1111999-09-22 09:46:25 +000027#endif
28
Daniel Veillard14fff061999-06-22 21:49:07 +000029#include <stdio.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000030#include <string.h>
31
32#ifdef HAVE_CTYPE_H
33#include <ctype.h>
34#endif
Daniel Veillard6d3bf1f1999-12-16 17:52:19 +000035#ifdef HAVE_STDLIB_H
36#include <stdlib.h>
37#endif
Daniel Veillard496a1cf2000-05-03 14:20:55 +000038#include <libxml/xmlversion.h>
39#ifdef LIBXML_ICONV_ENABLED
40#ifdef HAVE_ERRNO_H
41#include <errno.h>
42#endif
43#endif
Daniel Veillard361d8452000-04-03 19:48:13 +000044#include <libxml/encoding.h>
45#include <libxml/xmlmemory.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000046#ifdef LIBXML_HTML_ENABLED
47#include <libxml/HTMLparser.h>
48#endif
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +000049#include <libxml/xmlerror.h>
Daniel Veillard891e4041998-10-19 00:43:02 +000050
Daniel Veillardcf461992000-03-14 18:30:20 +000051xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
52xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Daniel Veillardb05deb71999-08-10 19:04:08 +000053
/*
 * One entry of the encoding alias table: maps an alternative spelling
 * (@alias) to an encoding name (@name).
 */
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
struct _xmlCharEncodingAlias {
    const char *name;   /* the encoding name the alias stands for */
    const char *alias;  /* the alternative name */
};

/*
 * Alias table state. The table starts out empty; the Nb/Max names
 * suggest "entries in use" and "entries allocated" -- TODO confirm
 * against the code that manages the table.
 */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;
static int xmlCharEncodingAliasesMax = 0;
64
#ifdef LIBXML_ICONV_ENABLED
#if 0
#define DEBUG_ENCODING  /* Define this to get encoding traces */
#endif
#endif

/*
 * Byte-order flag for the host, initialized to 1 (little-endian assumed).
 * NOTE(review): presumably corrected by a runtime check during
 * initialization elsewhere in this file -- confirm.
 */
static int xmlLittleEndian = 1;
72
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * The routines below handle sequences of up to 4 octets, i.e. values
 * above 0xFFFF as well.
 */
83
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  on input, the number of bytes available at @utf; on output,
 *        the number of bytes used by the decoded character
 *
 * Read one UTF-8 encoded character from @utf. Sequences of 1 to 4 bytes
 * are accepted. The lead byte must announce a 1-4 byte sequence and every
 * following byte must be a continuation byte (10xxxxxx). Overlong
 * encodings are not rejected.
 *
 * Returns the character value, or -1 in case of error. On error *@len is
 * set to 0, provided @len itself is not NULL.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if ((utf == NULL) || (len == NULL) || (*len < 1))
        goto error;

    c = utf[0];
    if ((c & 0x80) == 0) {
        /* 1-byte code: 0xxxxxxx */
        *len = 1;
        return((int) c);
    }

    /* a continuation byte (10xxxxxx) can never start a character */
    if ((c & 0xc0) == 0x80)
        goto error;

    if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
        goto error;
    if ((c & 0xe0) == 0xc0) {
        /* 2-byte code: 110xxxxx 10xxxxxx */
        *len = 2;
        c = (utf[0] & 0x1f) << 6;
        c |= utf[1] & 0x3f;
        return((int) c);
    }

    if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
        goto error;
    if ((c & 0xf0) == 0xe0) {
        /* 3-byte code: 1110xxxx 10xxxxxx 10xxxxxx */
        *len = 3;
        c = (utf[0] & 0xf) << 12;
        c |= (utf[1] & 0x3f) << 6;
        c |= utf[2] & 0x3f;
        return((int) c);
    }

    /* only 11110xxx is left as a legal lead byte */
    if ((*len < 4) || ((c & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto error;
    /* 4-byte code: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    *len = 4;
    c = (utf[0] & 0x7) << 18;
    c |= (utf[1] & 0x3f) << 12;
    c |= (utf[2] & 0x3f) << 6;
    c |= utf[3] & 0x3f;
    return((int) c);

error:
    /* @len may be the very reason we got here: never write through NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
150
/**
 * xmlCheckUTF8:
 * @utf:  pointer to a putative UTF-8 encoded, zero-terminated string
 *
 * Checks @utf for being valid UTF-8. The accepted forms are:
 *    0xxxxxxx                                  1-byte
 *    110xxxxx 10xxxxxx                         2-byte
 *    1110xxxx 10xxxxxx 10xxxxxx                3-byte
 *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx       4-byte
 * This function is not super-strict, as it will allow longer UTF-8
 * sequences than necessary (overlong encodings). It enforces the 4-byte
 * maximum sequence size but does not check for the 0x10FFFF maximum value.
 *
 * Returns 1 if @utf is valid, 0 otherwise (including when @utf is NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur = utf;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * The || chains below stop at the first non-continuation byte, so the
     * terminating zero is never stepped over.
     */
    while ((c = *cur) != 0) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code */
            cur += 1;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code */
            if ((cur[1] & 0xc0) != 0x80)
                return(0);
            cur += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code */
            if (((cur[1] & 0xc0) != 0x80) ||
                ((cur[2] & 0xc0) != 0x80))
                return(0);
            cur += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if (((cur[1] & 0xc0) != 0x80) ||
                ((cur[2] & 0xc0) != 0x80) ||
                ((cur[3] & 0xc0) != 0x80))
                return(0);
            cur += 4;
        } else {
            /* stray continuation byte or a lead byte for 5+ bytes */
            return(0);
        }
    }
    return(1);
}
194
195/**
Daniel Veillard32bc74e2000-07-14 14:49:25 +0000196 * asciiToUTF8:
197 * @out: a pointer to an array of bytes to store the result
198 * @outlen: the length of @out
199 * @in: a pointer to an array of ASCII chars
200 * @inlen: the length of @in
201 *
202 * Take a block of ASCII chars in and try to convert it to an UTF-8
203 * block of chars out.
204 * Returns 0 if success, or -1 otherwise
205 * The value of @inlen after return is the number of octets consumed
206 * as the return value is positive, else unpredictiable.
207 * The value of @outlen after return is the number of ocetes consumed.
208 */
209int
210asciiToUTF8(unsigned char* out, int *outlen,
211 const unsigned char* in, int *inlen) {
212 unsigned char* outstart = out;
213 const unsigned char* base = in;
214 const unsigned char* processed = in;
215 unsigned char* outend = out + *outlen;
216 const unsigned char* inend;
217 unsigned int c;
218 int bits;
219
220 inend = in + (*inlen);
221 while ((in < inend) && (out - outstart + 5 < *outlen)) {
222 c= *in++;
223
224 /* assertion: c is a single UTF-4 value */
225 if (out >= outend)
226 break;
227 if (c < 0x80) { *out++= c; bits= -6; }
228 else {
229 *outlen = out - outstart;
230 *inlen = processed - base;
231 return(-1);
232 }
233
234 for ( ; bits >= 0; bits-= 6) {
235 if (out >= outend)
236 break;
237 *out++= ((c >> bits) & 0x3F) | 0x80;
238 }
239 processed = (const unsigned char*) in;
240 }
241 *outlen = out - outstart;
242 *inlen = processed - base;
243 return(0);
244}
245
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. An incomplete sequence at the end of @in is left
 * unconsumed so that it can be completed by a later call.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a character outside of ASCII), or -1 if @out, @outlen or @inlen is NULL.
 * A NULL @in is an initialization call: nothing is output and 0 is returned.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *processed = in;
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned char *outstart = out;
    unsigned char *outend;
    unsigned int c, d;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL)
        return(-1);

    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto invalid;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in Ascii */
            goto invalid;
        }

        /* incomplete sequence: keep it for the next call */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            /* a malformed sequence must not yield a made-up character */
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80) {
            /* no chance for this in Ascii */
            goto invalid;
        }
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(0);

invalid:
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(-2);
}
328
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. A char below 0x80 takes one output byte, any other
 * char takes two; a char is only consumed when its whole encoding fits
 * in @out.
 *
 * Returns 0 if success, or -1 if @out, @outlen or @inlen is NULL.
 * A NULL @in is treated as empty input.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *outstart = out;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned int c;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /* nothing to convert */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = (unsigned char) c;
        } else {
            /* 0x80..0xFF always encodes as 110000xx 10xxxxxx */
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (c & 0x3F));
        }
        in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - instart);
    return(0);
}
375
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. An incomplete sequence at the end of @in is left
 * unconsumed so that it can be completed by a later call.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a character above 0xFF), or -1 if @out, @outlen or @inlen is NULL.
 * A NULL @in is an initialization call: nothing is output and 0 is returned.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *processed = in;
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned char *outstart = out;
    unsigned char *outend;
    unsigned int c, d;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL)
        return(-1);

    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto invalid;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in IsoLat1 */
            goto invalid;
        }

        /* incomplete sequence: keep it for the next call */
        if (inend - in < trailing)
            break;

        /* the check above guarantees all trailing bytes are available */
        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c > 0xFF) {
            /* no chance for this in IsoLat1 */
            goto invalid;
        }
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(0);

invalid:
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(-2);
}
463
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE chars passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE chars in and try to convert it to an UTF-8
 * block of chars out. The input is read byte by byte, so the result does
 * not depend on the byte order or alignment requirements of the host.
 * A trailing odd byte, or a high surrogate whose low surrogate is not yet
 * available, is left unconsumed. A char is only consumed when its whole
 * UTF-8 encoding fits in @out.
 *
 * Returns 0 if success, -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate), or -1 if @out, @outlen or @inlenb
 * is NULL. A NULL @inb is treated as empty input.
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char *outstart = out;
    unsigned char *outend;
    const unsigned char *cur = inb;
    const unsigned char *next;
    const unsigned char *inend;
    unsigned int c, d, lead;
    int need, bits;

    if ((out == NULL) || (outlen == NULL) || (inlenb == NULL))
        return(-1);
    if (inb == NULL) {
        /* nothing to convert */
        *outlen = 0;
        *inlenb = 0;
        return(0);
    }

    /* only whole 16-bit units are looked at */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + (*inlenb);
    outend = out + (*outlen);

    while (cur < inend) {
        c = (unsigned int) cur[0] | ((unsigned int) cur[1] << 8);
        next = cur + 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (next >= inend) {
                /* the low surrogate has not arrived yet */
                break;
            }
            d = (unsigned int) next[0] | ((unsigned int) next[1] << 8);
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (cur - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80)         { need = 1; lead = 0x00; }
        else if (c < 0x800)   { need = 2; lead = 0xC0; }
        else if (c < 0x10000) { need = 3; lead = 0xE0; }
        else                  { need = 4; lead = 0xF0; }

        /* never emit a truncated sequence */
        if (outend - out < need)
            break;

        bits = (need - 1) * 6;
        *out++ = (unsigned char) (lead | (c >> bits));
        while (bits > 0) {
            bits -= 6;
            *out++ = (unsigned char) (0x80 | ((c >> bits) & 0x3F));
        }
        cur = next;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (cur - inb);
    return(0);
}
551
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written byte by byte, so the result
 * does not depend on the byte order or alignment requirements of the host.
 * An incomplete sequence at the end of @in is left unconsumed so that it
 * can be completed by a later call.
 *
 * When @in is NULL this is an initialization call: the Byte Order Mark
 * (0xFF 0xFE) is written if @outb has room for it and 2 is returned,
 * otherwise nothing is written and 0 is returned.
 *
 * Returns 0 if success, -2 if the transcoding fails (malformed UTF-8 or
 * a value above 0x10FFFF), or -1 if @outlen or @inlen is NULL, or @outb
 * is NULL while there is input to convert.
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *processed = in;
    const unsigned char *inend;
    unsigned int c, d, hi, lo;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if ((outb != NULL) && (*outlen >= 2)) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (outb == NULL)
        return(-1);

    inend = in + (*inlen);
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if (d < 0x80) {
            c = d;
            trailing = 0;
        } else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto invalid;
        } else if (d < 0xE0) {
            c = d & 0x1F;
            trailing = 1;
        } else if (d < 0xF0) {
            c = d & 0x0F;
            trailing = 2;
        } else if (d < 0xF8) {
            c = d & 0x07;
            trailing = 3;
        } else {
            /* no chance for this in UTF-16 */
            goto invalid;
        }

        /* incomplete sequence: keep it for the next call */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            /* a malformed sequence must not yield a made-up character */
            if ((d & 0xC0) != 0x80)
                goto invalid;
            c = (c << 6) | (d & 0x3F);
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        } else if (c < 0x110000) {
            /* needs a surrogate pair: two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
        } else {
            /* beyond the range UTF-16 can represent */
            goto invalid;
        }
        processed = in;
    }
    *outlen = (int) (out - outb);
    /* count from the start of the input, not from the moving cursor */
    *inlen = (int) (processed - instart);
    return(0);

invalid:
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(-2);
}
670
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE chars passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE chars in and try to convert it to an UTF-8
 * block of chars out. The input is read byte by byte, so the result does
 * not depend on the byte order or alignment requirements of the host.
 * A trailing odd byte, or a high surrogate whose low surrogate is not yet
 * available, is left unconsumed (as in UTF16LEToUTF8). A char is only
 * consumed when its whole UTF-8 encoding fits in @out.
 *
 * Returns 0 if success, -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate), or -1 if @out, @outlen or @inlenb
 * is NULL. A NULL @inb is treated as empty input.
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char *outstart = out;
    unsigned char *outend;
    const unsigned char *cur = inb;
    const unsigned char *next;
    const unsigned char *inend;
    unsigned int c, d, lead;
    int need, bits;

    if ((out == NULL) || (outlen == NULL) || (inlenb == NULL))
        return(-1);
    if (inb == NULL) {
        /* nothing to convert */
        *outlen = 0;
        *inlenb = 0;
        return(0);
    }

    /* only whole 16-bit units are looked at */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + (*inlenb);
    outend = out + (*outlen);

    while (cur < inend) {
        c = ((unsigned int) cur[0] << 8) | (unsigned int) cur[1];
        next = cur + 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (next >= inend) {
                /* the low surrogate has not arrived yet */
                break;
            }
            d = ((unsigned int) next[0] << 8) | (unsigned int) next[1];
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (cur - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80)         { need = 1; lead = 0x00; }
        else if (c < 0x800)   { need = 2; lead = 0xC0; }
        else if (c < 0x10000) { need = 3; lead = 0xE0; }
        else                  { need = 4; lead = 0xF0; }

        /* never emit a truncated sequence */
        if (outend - out < need)
            break;

        bits = (need - 1) * 6;
        *out++ = (unsigned char) (lead | (c >> bits));
        while (bits > 0) {
            bits -= 6;
            *out++ = (unsigned char) (0x80 | ((c >> bits) & 0x3F));
        }
        cur = next;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (cur - inb);
    return(0);
}
762
763/**
764 * UTF8ToUTF16BE:
765 * @outb: a pointer to an array of bytes to store the result
766 * @outlen: the length of @outb
767 * @in: a pointer to an array of UTF-8 chars
768 * @inlen: the length of @in
769 *
770 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
771 * block of chars out.
Daniel Veillardcf461992000-03-14 18:30:20 +0000772 *
773 * Returns the number of byte written, or -1 by lack of space, or -2
774 * if the transcoding failed.
775 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    /*
     * Behaviour implemented below:
     *  - in == NULL is the initialization call: the FE FF Byte Order Mark is
     *    emitted when at least 2 bytes are available and the number of bytes
     *    written (2 or 0) is returned.
     *  - otherwise as many complete UTF-8 sequences as fit are converted;
     *    *outlen receives the number of BYTES written, *inlen the number of
     *    input BYTES consumed, and 0 is returned.
     *  - -2 is returned on an invalid lead byte, with *outlen / *inlen
     *    describing what was converted before it.
     *
     * The output is produced byte by byte, high byte first, so the result
     * does not depend on the host endianness or on the alignment of @outb.
     */
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *processed = in;
    const unsigned char *inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    inend = in + *inlen;
    /* only whole 16-bit units can be emitted, ignore a trailing odd byte */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            *outlen = (int) (out - outb);
            *inlen = (int) (processed - instart);
            return(-2);
        } else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            *outlen = (int) (out - outb);
            *inlen = (int) (processed - instart);
            return(-2);
        }

        if (inend - in < trailing) {
            /* incomplete sequence at the end of the input: not consumed */
            break;
        }

        /*
         * NOTE(review): a malformed continuation byte only stops the
         * accumulation; the partial value is still emitted, as before.
         */
        for ( ; trailing; trailing--) {
            if ((in >= inend) || (((d = *in++) & 0xC0) != 0x80)) break;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2) break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) c;
        }
        else if (c < 0x110000) {
            unsigned int high, low;

            /* needs a surrogate pair, i.e. two 16-bit units */
            if (outend - out < 4) break;
            c -= 0x10000;
            high = 0xD800 | (c >> 10);
            low = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (high >> 8);
            *out++ = (unsigned char) high;
            *out++ = (unsigned char) (low >> 8);
            *out++ = (unsigned char) low;
        }
        else
            break;
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(0);
}
Daniel Veillard97b58771998-10-20 06:14:16 +0000878
Daniel Veillard27d88741999-05-29 11:51:49 +0000879/**
880 * xmlDetectCharEncoding:
881 * @in: a pointer to the first bytes of the XML entity, must be at least
882 * 4 bytes long.
Daniel Veillardcf461992000-03-14 18:30:20 +0000883 * @len: pointer to the length of the buffer
Daniel Veillard27d88741999-05-29 11:51:49 +0000884 *
885 * Guess the encoding of the entity using the first bytes of the entity content
886 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
887 *
888 * Returns one of the XML_CHAR_ENCODING_... values.
889 */
890xmlCharEncoding
Daniel Veillardcf461992000-03-14 18:30:20 +0000891xmlDetectCharEncoding(const unsigned char* in, int len)
Daniel Veillard27d88741999-05-29 11:51:49 +0000892{
Daniel Veillardcf461992000-03-14 18:30:20 +0000893 if (len >= 4) {
894 if ((in[0] == 0x00) && (in[1] == 0x00) &&
895 (in[2] == 0x00) && (in[3] == 0x3C))
896 return(XML_CHAR_ENCODING_UCS4BE);
897 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
898 (in[2] == 0x00) && (in[3] == 0x00))
899 return(XML_CHAR_ENCODING_UCS4LE);
900 if ((in[0] == 0x00) && (in[1] == 0x00) &&
901 (in[2] == 0x3C) && (in[3] == 0x00))
902 return(XML_CHAR_ENCODING_UCS4_2143);
903 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
904 (in[2] == 0x00) && (in[3] == 0x00))
905 return(XML_CHAR_ENCODING_UCS4_3412);
906 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
907 (in[2] == 0xA7) && (in[3] == 0x94))
908 return(XML_CHAR_ENCODING_EBCDIC);
909 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
910 (in[2] == 0x78) && (in[3] == 0x6D))
911 return(XML_CHAR_ENCODING_UTF8);
912 }
913 if (len >= 2) {
914 if ((in[0] == 0xFE) && (in[1] == 0xFF))
915 return(XML_CHAR_ENCODING_UTF16BE);
916 if ((in[0] == 0xFF) && (in[1] == 0xFE))
917 return(XML_CHAR_ENCODING_UTF16LE);
918 }
Daniel Veillard27d88741999-05-29 11:51:49 +0000919 return(XML_CHAR_ENCODING_NONE);
920}
921
922/**
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +0000923 * xmlCleanupEncodingAliases:
924 *
925 * Unregisters all aliases
926 */
927void
928xmlCleanupEncodingAliases(void) {
929 int i;
930
931 if (xmlCharEncodingAliases == NULL)
932 return;
933
934 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
935 if (xmlCharEncodingAliases[i].name != NULL)
936 xmlFree((char *) xmlCharEncodingAliases[i].name);
937 if (xmlCharEncodingAliases[i].alias != NULL)
938 xmlFree((char *) xmlCharEncodingAliases[i].alias);
939 }
940 xmlCharEncodingAliasesNb = 0;
941 xmlCharEncodingAliasesMax = 0;
942 xmlFree(xmlCharEncodingAliases);
943}
944
945/**
946 * xmlGetEncodingAlias:
947 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
948 *
949 * Lookup an encoding name for the given alias.
950 *
951 * Returns NULL if not found the original name otherwise
952 */
953const char *
954xmlGetEncodingAlias(const char *alias) {
955 int i;
956 char upper[100];
957
958 if (alias == NULL)
959 return(NULL);
960
961 if (xmlCharEncodingAliases == NULL)
962 return(NULL);
963
964 for (i = 0;i < 99;i++) {
965 upper[i] = toupper(alias[i]);
966 if (upper[i] == 0) break;
967 }
968 upper[i] = 0;
969
970 /*
971 * Walk down the list looking for a definition of the alias
972 */
973 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
974 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
975 return(xmlCharEncodingAliases[i].name);
976 }
977 }
978 return(NULL);
979}
980
981/**
982 * xmlAddEncodingAlias:
983 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
984 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
985 *
986 * Registers and alias @alias for an encoding named @name. Existing alias
987 * will be overwritten.
988 *
989 * Returns 0 in case of success, -1 in case of error
990 */
991int
992xmlAddEncodingAlias(const char *name, const char *alias) {
993 int i;
994 char upper[100];
995
996 if ((name == NULL) || (alias == NULL))
997 return(-1);
998
999 for (i = 0;i < 99;i++) {
1000 upper[i] = toupper(alias[i]);
1001 if (upper[i] == 0) break;
1002 }
1003 upper[i] = 0;
1004
1005 if (xmlCharEncodingAliases == NULL) {
1006 xmlCharEncodingAliasesNb = 0;
1007 xmlCharEncodingAliasesMax = 20;
1008 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1009 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1010 if (xmlCharEncodingAliases == NULL)
1011 return(-1);
1012 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1013 xmlCharEncodingAliasesMax *= 2;
1014 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1015 xmlRealloc(xmlCharEncodingAliases,
1016 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1017 }
1018 /*
1019 * Walk down the list looking for a definition of the alias
1020 */
1021 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1022 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1023 /*
1024 * Replace the definition.
1025 */
1026 xmlFree((char *) xmlCharEncodingAliases[i].name);
1027 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1028 return(0);
1029 }
1030 }
1031 /*
1032 * Add the definition
1033 */
1034 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1035 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1036 xmlCharEncodingAliasesNb++;
1037 return(0);
1038}
1039
1040/**
1041 * xmlDelEncodingAlias:
1042 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1043 *
1044 * Unregisters an encoding alias @alias
1045 *
1046 * Returns 0 in case of success, -1 in case of error
1047 */
1048int
1049xmlDelEncodingAlias(const char *alias) {
1050 int i;
1051
1052 if (alias == NULL)
1053 return(-1);
1054
1055 if (xmlCharEncodingAliases == NULL)
1056 return(-1);
1057 /*
1058 * Walk down the list looking for a definition of the alias
1059 */
1060 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1061 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1062 xmlFree((char *) xmlCharEncodingAliases[i].name);
1063 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1064 xmlCharEncodingAliasesNb--;
1065 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1066 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1067 return(0);
1068 }
1069 }
1070 return(-1);
1071}
1072
1073/**
Daniel Veillard27d88741999-05-29 11:51:49 +00001074 * xmlParseCharEncoding:
Daniel Veillard7f858501999-11-17 17:32:38 +00001075 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
Daniel Veillard27d88741999-05-29 11:51:49 +00001076 *
1077 * Conpare the string to the known encoding schemes already known. Note
1078 * that the comparison is case insensitive accordingly to the section
1079 * [XML] 4.3.3 Character Encoding in Entities.
1080 *
1081 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1082 * if not recognized.
1083 */
1084xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +00001085xmlParseCharEncoding(const char* name)
Daniel Veillard27d88741999-05-29 11:51:49 +00001086{
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001087 const char *alias;
Daniel Veillard27d88741999-05-29 11:51:49 +00001088 char upper[500];
1089 int i;
1090
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001091 if (name == NULL)
1092 return(XML_CHAR_ENCODING_NONE);
1093
1094 /*
1095 * Do the alias resolution
1096 */
1097 alias = xmlGetEncodingAlias(name);
1098 if (alias != NULL)
1099 name = alias;
1100
Daniel Veillard27d88741999-05-29 11:51:49 +00001101 for (i = 0;i < 499;i++) {
1102 upper[i] = toupper(name[i]);
1103 if (upper[i] == 0) break;
1104 }
1105 upper[i] = 0;
1106
1107 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1108 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1109 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1110
1111 /*
1112 * NOTE: if we were able to parse this, the endianness of UTF16 is
1113 * already found and in use
1114 */
1115 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1116 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1117
1118 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1119 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1120 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1121
1122 /*
1123 * NOTE: if we were able to parse this, the endianness of UCS4 is
1124 * already found and in use
1125 */
1126 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1127 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1128 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1129
1130
1131 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1132 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1133 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1134
1135 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1136 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1137 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1138
1139 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1140 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1141 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1142 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1143 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1144 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1145 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1146
1147 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001148 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
Daniel Veillard27d88741999-05-29 11:51:49 +00001149 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001150
1151#ifdef DEBUG_ENCODING
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001152 xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001153#endif
Daniel Veillard27d88741999-05-29 11:51:49 +00001154 return(XML_CHAR_ENCODING_ERROR);
1155}
Daniel Veillard14fff061999-06-22 21:49:07 +00001156
Daniel Veillardbe803962000-06-28 23:40:59 +00001157/**
1158 * xmlGetCharEncodingName:
1159 * @enc: the encoding
1160 *
1161 * The "canonical" name for XML encoding.
1162 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1163 * Section 4.3.3 Character Encoding in Entities
1164 *
1165 * Returns the canonical name for the given encoding
1166 */
1167
1168const char*
1169xmlGetCharEncodingName(xmlCharEncoding enc) {
1170 switch (enc) {
1171 case XML_CHAR_ENCODING_ERROR:
1172 return(NULL);
1173 case XML_CHAR_ENCODING_NONE:
1174 return(NULL);
1175 case XML_CHAR_ENCODING_UTF8:
1176 return("UTF-8");
1177 case XML_CHAR_ENCODING_UTF16LE:
1178 return("UTF-16");
1179 case XML_CHAR_ENCODING_UTF16BE:
1180 return("UTF-16");
1181 case XML_CHAR_ENCODING_EBCDIC:
1182 return("EBCDIC");
1183 case XML_CHAR_ENCODING_UCS4LE:
1184 return("ISO-10646-UCS-4");
1185 case XML_CHAR_ENCODING_UCS4BE:
1186 return("ISO-10646-UCS-4");
1187 case XML_CHAR_ENCODING_UCS4_2143:
1188 return("ISO-10646-UCS-4");
1189 case XML_CHAR_ENCODING_UCS4_3412:
1190 return("ISO-10646-UCS-4");
1191 case XML_CHAR_ENCODING_UCS2:
1192 return("ISO-10646-UCS-2");
1193 case XML_CHAR_ENCODING_8859_1:
1194 return("ISO-8859-1");
1195 case XML_CHAR_ENCODING_8859_2:
1196 return("ISO-8859-2");
1197 case XML_CHAR_ENCODING_8859_3:
1198 return("ISO-8859-3");
1199 case XML_CHAR_ENCODING_8859_4:
1200 return("ISO-8859-4");
1201 case XML_CHAR_ENCODING_8859_5:
1202 return("ISO-8859-5");
1203 case XML_CHAR_ENCODING_8859_6:
1204 return("ISO-8859-6");
1205 case XML_CHAR_ENCODING_8859_7:
1206 return("ISO-8859-7");
1207 case XML_CHAR_ENCODING_8859_8:
1208 return("ISO-8859-8");
1209 case XML_CHAR_ENCODING_8859_9:
1210 return("ISO-8859-9");
1211 case XML_CHAR_ENCODING_2022_JP:
1212 return("ISO-2022-JP");
1213 case XML_CHAR_ENCODING_SHIFT_JIS:
1214 return("Shift-JIS");
1215 case XML_CHAR_ENCODING_EUC_JP:
1216 return("EUC-JP");
Daniel Veillard87b95392000-08-12 21:12:04 +00001217 case XML_CHAR_ENCODING_ASCII:
1218 return(NULL);
Daniel Veillardbe803962000-06-28 23:40:59 +00001219 }
1220 return(NULL);
1221}
1222
Daniel Veillard14fff061999-06-22 21:49:07 +00001223/****************************************************************
1224 * *
1225 * Char encoding handlers *
1226 * *
1227 ****************************************************************/
1228
/*
 * Capacity of the handlers table below: xmlInitCharEncodingHandlers()
 * allocates that many slots and xmlRegisterCharEncodingHandler() refuses
 * further registrations once they are all used.
 * The size should be growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
/* table of the registered handlers, NULL until the first initialization */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of slots of the table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * This is the value xmlFindCharEncodingHandler() returns for a NULL or
 * empty encoding name.
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1240
1241/**
1242 * xmlNewCharEncodingHandler:
Daniel Veillard7f858501999-11-17 17:32:38 +00001243 * @name: the encoding name, in UTF-8 format (ASCII actually)
Daniel Veillard14fff061999-06-22 21:49:07 +00001244 * @input: the xmlCharEncodingInputFunc to read that encoding
1245 * @output: the xmlCharEncodingOutputFunc to write that encoding
1246 *
1247 * Create and registers an xmlCharEncodingHandler.
1248 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1249 */
1250xmlCharEncodingHandlerPtr
Daniel Veillardcf461992000-03-14 18:30:20 +00001251xmlNewCharEncodingHandler(const char *name,
1252 xmlCharEncodingInputFunc input,
Daniel Veillard14fff061999-06-22 21:49:07 +00001253 xmlCharEncodingOutputFunc output) {
1254 xmlCharEncodingHandlerPtr handler;
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001255 const char *alias;
Daniel Veillard14fff061999-06-22 21:49:07 +00001256 char upper[500];
1257 int i;
1258 char *up = 0;
1259
1260 /*
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001261 * Do the alias resolution
1262 */
1263 alias = xmlGetEncodingAlias(name);
1264 if (alias != NULL)
1265 name = alias;
1266
1267 /*
Daniel Veillard14fff061999-06-22 21:49:07 +00001268 * Keep only the uppercase version of the encoding.
1269 */
1270 if (name == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001271 xmlGenericError(xmlGenericErrorContext,
1272 "xmlNewCharEncodingHandler : no name !\n");
Daniel Veillard14fff061999-06-22 21:49:07 +00001273 return(NULL);
1274 }
1275 for (i = 0;i < 499;i++) {
1276 upper[i] = toupper(name[i]);
1277 if (upper[i] == 0) break;
1278 }
1279 upper[i] = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00001280 up = xmlMemStrdup(upper);
Daniel Veillard14fff061999-06-22 21:49:07 +00001281 if (up == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001282 xmlGenericError(xmlGenericErrorContext,
1283 "xmlNewCharEncodingHandler : out of memory !\n");
Daniel Veillard14fff061999-06-22 21:49:07 +00001284 return(NULL);
1285 }
1286
1287 /*
1288 * allocate and fill-up an handler block.
1289 */
1290 handler = (xmlCharEncodingHandlerPtr)
Daniel Veillard6454aec1999-09-02 22:04:43 +00001291 xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard14fff061999-06-22 21:49:07 +00001292 if (handler == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001293 xmlGenericError(xmlGenericErrorContext,
1294 "xmlNewCharEncodingHandler : out of memory !\n");
Daniel Veillard14fff061999-06-22 21:49:07 +00001295 return(NULL);
1296 }
1297 handler->input = input;
1298 handler->output = output;
1299 handler->name = up;
1300
Daniel Veillard87b95392000-08-12 21:12:04 +00001301#ifdef LIBXML_ICONV_ENABLED
1302 handler->iconv_in = NULL;
1303 handler->iconv_out = NULL;
1304#endif /* LIBXML_ICONV_ENABLED */
1305
Daniel Veillard14fff061999-06-22 21:49:07 +00001306 /*
1307 * registers and returns the handler.
1308 */
1309 xmlRegisterCharEncodingHandler(handler);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001310#ifdef DEBUG_ENCODING
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001311 xmlGenericError(xmlGenericErrorContext,
1312 "Registered encoding handler for %s\n", name);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001313#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001314 return(handler);
1315}
1316
1317/**
1318 * xmlInitCharEncodingHandlers:
1319 *
1320 * Initialize the char encoding support, it registers the default
1321 * encoding supported.
Daniel Veillard7f858501999-11-17 17:32:38 +00001322 * NOTE: while public, this function usually doesn't need to be called
Daniel Veillard14fff061999-06-22 21:49:07 +00001323 * in normal processing.
1324 */
1325void
1326xmlInitCharEncodingHandlers(void) {
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001327 unsigned short int tst = 0x1234;
1328 unsigned char *ptr = (unsigned char *) &tst;
1329
Daniel Veillard14fff061999-06-22 21:49:07 +00001330 if (handlers != NULL) return;
1331
1332 handlers = (xmlCharEncodingHandlerPtr *)
Daniel Veillard6454aec1999-09-02 22:04:43 +00001333 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
Daniel Veillard14fff061999-06-22 21:49:07 +00001334
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001335 if (*ptr == 0x12) xmlLittleEndian = 0;
1336 else if (*ptr == 0x34) xmlLittleEndian = 1;
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001337 else xmlGenericError(xmlGenericErrorContext,
1338 "Odd problem at endianness detection\n");
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001339
Daniel Veillard14fff061999-06-22 21:49:07 +00001340 if (handlers == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001341 xmlGenericError(xmlGenericErrorContext,
1342 "xmlInitCharEncodingHandlers : out of memory !\n");
Daniel Veillard14fff061999-06-22 21:49:07 +00001343 return;
1344 }
1345 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
Daniel Veillardcf461992000-03-14 18:30:20 +00001346 xmlUTF16LEHandler =
1347 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1348 xmlUTF16BEHandler =
1349 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
Daniel Veillard14fff061999-06-22 21:49:07 +00001350 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001351 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1352#ifdef LIBXML_HTML_ENABLED
1353 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1354#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001355}
1356
1357/**
Daniel Veillarda819dac1999-11-24 18:04:22 +00001358 * xmlCleanupCharEncodingHandlers:
1359 *
1360 * Cleanup the memory allocated for the char encoding support, it
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001361 * unregisters all the encoding handlers and the aliases.
Daniel Veillarda819dac1999-11-24 18:04:22 +00001362 */
1363void
1364xmlCleanupCharEncodingHandlers(void) {
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001365 xmlCleanupEncodingAliases();
1366
Daniel Veillarda819dac1999-11-24 18:04:22 +00001367 if (handlers == NULL) return;
1368
1369 for (;nbCharEncodingHandler > 0;) {
1370 nbCharEncodingHandler--;
1371 if (handlers[nbCharEncodingHandler] != NULL) {
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001372 if (handlers[nbCharEncodingHandler]->name != NULL)
1373 xmlFree(handlers[nbCharEncodingHandler]->name);
Daniel Veillarda819dac1999-11-24 18:04:22 +00001374 xmlFree(handlers[nbCharEncodingHandler]);
1375 }
1376 }
1377 xmlFree(handlers);
1378 handlers = NULL;
1379 nbCharEncodingHandler = 0;
1380 xmlDefaultCharEncodingHandler = NULL;
1381}
1382
1383/**
Daniel Veillard14fff061999-06-22 21:49:07 +00001384 * xmlRegisterCharEncodingHandler:
1385 * @handler: the xmlCharEncodingHandlerPtr handler block
1386 *
1387 * Register the char encoding handler, surprizing, isn't it ?
1388 */
1389void
1390xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1391 if (handlers == NULL) xmlInitCharEncodingHandlers();
1392 if (handler == NULL) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001393 xmlGenericError(xmlGenericErrorContext,
1394 "xmlRegisterCharEncodingHandler: NULL handler !\n");
Daniel Veillard14fff061999-06-22 21:49:07 +00001395 return;
1396 }
1397
1398 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001399 xmlGenericError(xmlGenericErrorContext,
Daniel Veillard14fff061999-06-22 21:49:07 +00001400 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001401 xmlGenericError(xmlGenericErrorContext,
1402 "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
Daniel Veillard14fff061999-06-22 21:49:07 +00001403 return;
1404 }
1405 handlers[nbCharEncodingHandler++] = handler;
1406}
1407
1408/**
1409 * xmlGetCharEncodingHandler:
1410 * @enc: an xmlCharEncoding value.
1411 *
1412 * Search in the registrered set the handler able to read/write that encoding.
1413 *
1414 * Returns the handler or NULL if not found
1415 */
1416xmlCharEncodingHandlerPtr
1417xmlGetCharEncodingHandler(xmlCharEncoding enc) {
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001418 xmlCharEncodingHandlerPtr handler;
1419
Daniel Veillard14fff061999-06-22 21:49:07 +00001420 if (handlers == NULL) xmlInitCharEncodingHandlers();
Daniel Veillardcf461992000-03-14 18:30:20 +00001421 switch (enc) {
1422 case XML_CHAR_ENCODING_ERROR:
1423 return(NULL);
1424 case XML_CHAR_ENCODING_NONE:
1425 return(NULL);
1426 case XML_CHAR_ENCODING_UTF8:
1427 return(NULL);
1428 case XML_CHAR_ENCODING_UTF16LE:
1429 return(xmlUTF16LEHandler);
1430 case XML_CHAR_ENCODING_UTF16BE:
1431 return(xmlUTF16BEHandler);
1432 case XML_CHAR_ENCODING_EBCDIC:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001433 handler = xmlFindCharEncodingHandler("EBCDIC");
1434 if (handler != NULL) return(handler);
1435 handler = xmlFindCharEncodingHandler("ebcdic");
1436 if (handler != NULL) return(handler);
1437 break;
Daniel Veillardbe803962000-06-28 23:40:59 +00001438 case XML_CHAR_ENCODING_UCS4BE:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001439 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1440 if (handler != NULL) return(handler);
1441 handler = xmlFindCharEncodingHandler("UCS-4");
1442 if (handler != NULL) return(handler);
1443 handler = xmlFindCharEncodingHandler("UCS4");
1444 if (handler != NULL) return(handler);
1445 break;
Daniel Veillardbe803962000-06-28 23:40:59 +00001446 case XML_CHAR_ENCODING_UCS4LE:
1447 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1448 if (handler != NULL) return(handler);
1449 handler = xmlFindCharEncodingHandler("UCS-4");
1450 if (handler != NULL) return(handler);
1451 handler = xmlFindCharEncodingHandler("UCS4");
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001452 if (handler != NULL) return(handler);
1453 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001454 case XML_CHAR_ENCODING_UCS4_2143:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001455 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001456 case XML_CHAR_ENCODING_UCS4_3412:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001457 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001458 case XML_CHAR_ENCODING_UCS2:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001459 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1460 if (handler != NULL) return(handler);
1461 handler = xmlFindCharEncodingHandler("UCS-2");
1462 if (handler != NULL) return(handler);
1463 handler = xmlFindCharEncodingHandler("UCS2");
1464 if (handler != NULL) return(handler);
1465 break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001466
1467 /*
1468 * We used to keep ISO Latin encodings native in the
1469 * generated data. This led to so many problems that
1470 * this has been removed. One can still change this
1471 * back by registering no-ops encoders for those
1472 */
Daniel Veillardcf461992000-03-14 18:30:20 +00001473 case XML_CHAR_ENCODING_8859_1:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001474 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1475 if (handler != NULL) return(handler);
1476 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001477 case XML_CHAR_ENCODING_8859_2:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001478 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1479 if (handler != NULL) return(handler);
1480 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001481 case XML_CHAR_ENCODING_8859_3:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001482 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1483 if (handler != NULL) return(handler);
1484 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001485 case XML_CHAR_ENCODING_8859_4:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001486 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1487 if (handler != NULL) return(handler);
1488 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001489 case XML_CHAR_ENCODING_8859_5:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001490 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1491 if (handler != NULL) return(handler);
1492 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001493 case XML_CHAR_ENCODING_8859_6:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001494 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1495 if (handler != NULL) return(handler);
1496 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001497 case XML_CHAR_ENCODING_8859_7:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001498 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1499 if (handler != NULL) return(handler);
1500 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001501 case XML_CHAR_ENCODING_8859_8:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001502 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1503 if (handler != NULL) return(handler);
1504 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001505 case XML_CHAR_ENCODING_8859_9:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001506 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1507 if (handler != NULL) return(handler);
1508 break;
1509
1510
Daniel Veillardcf461992000-03-14 18:30:20 +00001511 case XML_CHAR_ENCODING_2022_JP:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001512 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1513 if (handler != NULL) return(handler);
1514 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001515 case XML_CHAR_ENCODING_SHIFT_JIS:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001516 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1517 if (handler != NULL) return(handler);
1518 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1519 if (handler != NULL) return(handler);
1520 handler = xmlFindCharEncodingHandler("Shift_JIS");
1521 if (handler != NULL) return(handler);
1522 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001523 case XML_CHAR_ENCODING_EUC_JP:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001524 handler = xmlFindCharEncodingHandler("EUC-JP");
1525 if (handler != NULL) return(handler);
1526 break;
1527 default:
1528 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001529 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001530
1531#ifdef DEBUG_ENCODING
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001532 xmlGenericError(xmlGenericErrorContext,
1533 "No handler found for encoding %d\n", enc);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001534#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001535 return(NULL);
1536}
1537
/**
 * xmlFindCharEncodingHandler:
 * @name:  a string describing the char encoding.
 *
 * Search in the registered set the handler able to read/write that encoding.
 *
 * Returns the handler or NULL if not found
 */
1546xmlCharEncodingHandlerPtr
1547xmlFindCharEncodingHandler(const char *name) {
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001548 const char *nalias;
1549 const char *norig;
Daniel Veillardbe803962000-06-28 23:40:59 +00001550 xmlCharEncoding alias;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001551#ifdef LIBXML_ICONV_ENABLED
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00001552 xmlCharEncodingHandlerPtr enc;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001553 iconv_t icv_in, icv_out;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001554#endif /* LIBXML_ICONV_ENABLED */
1555 char upper[100];
Daniel Veillard14fff061999-06-22 21:49:07 +00001556 int i;
1557
1558 if (handlers == NULL) xmlInitCharEncodingHandlers();
1559 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1560 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1561
Daniel Veillardbe803962000-06-28 23:40:59 +00001562 /*
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001563 * Do the alias resolution
1564 */
1565 norig = name;
1566 nalias = xmlGetEncodingAlias(name);
1567 if (nalias != NULL)
1568 name = nalias;
1569
1570 /*
Daniel Veillardbe803962000-06-28 23:40:59 +00001571 * Check first for directly registered encoding names
1572 */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001573 for (i = 0;i < 99;i++) {
Daniel Veillard14fff061999-06-22 21:49:07 +00001574 upper[i] = toupper(name[i]);
1575 if (upper[i] == 0) break;
1576 }
1577 upper[i] = 0;
1578
1579 for (i = 0;i < nbCharEncodingHandler; i++)
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001580 if (!strcmp(upper, handlers[i]->name)) {
1581#ifdef DEBUG_ENCODING
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001582 xmlGenericError(xmlGenericErrorContext,
1583 "Found registered handler for encoding %s\n", name);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001584#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001585 return(handlers[i]);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001586 }
Daniel Veillard14fff061999-06-22 21:49:07 +00001587
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001588#ifdef LIBXML_ICONV_ENABLED
1589 /* check whether iconv can handle this */
1590 icv_in = iconv_open("UTF-8", name);
1591 icv_out = iconv_open(name, "UTF-8");
1592 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001593 enc = (xmlCharEncodingHandlerPtr)
1594 xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001595 if (enc == NULL) {
1596 iconv_close(icv_in);
1597 iconv_close(icv_out);
1598 return(NULL);
1599 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00001600 enc->name = xmlMemStrdup(name);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001601 enc->input = NULL;
1602 enc->output = NULL;
1603 enc->iconv_in = icv_in;
1604 enc->iconv_out = icv_out;
1605#ifdef DEBUG_ENCODING
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001606 xmlGenericError(xmlGenericErrorContext,
1607 "Found iconv handler for encoding %s\n", name);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001608#endif
1609 return enc;
1610 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001611 xmlGenericError(xmlGenericErrorContext,
1612 "iconv : problems with filters for '%s'\n", name);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001613 }
1614#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillardbe803962000-06-28 23:40:59 +00001615
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001616#ifdef DEBUG_ENCODING
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001617 xmlGenericError(xmlGenericErrorContext,
1618 "No handler found for encoding %s\n", name);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001619#endif
Daniel Veillardbe803962000-06-28 23:40:59 +00001620
1621 /*
1622 * Fallback using the canonical names
1623 */
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001624 alias = xmlParseCharEncoding(norig);
Daniel Veillardbe803962000-06-28 23:40:59 +00001625 if (alias != XML_CHAR_ENCODING_ERROR) {
1626 const char* canon;
1627 canon = xmlGetCharEncodingName(alias);
1628 if ((canon != NULL) && (strcmp(name, canon))) {
1629 return(xmlFindCharEncodingHandler(canon));
1630 }
1631 }
1632
Daniel Veillard14fff061999-06-22 21:49:07 +00001633 return(NULL);
1634}
1635
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001636#ifdef LIBXML_ICONV_ENABLED
/*
 * POSIX declares the input buffer of iconv() as "char **" while some
 * implementations use "const char **". Build systems usually provide
 * ICONV_CONST for the latter; default to the POSIX prototype otherwise.
 */
#ifndef ICONV_CONST
#define ICONV_CONST
#endif

/**
 * xmlIconvWrapper:
 * @cd:  iconv conversion descriptor
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes in the source encoding of @cd,
 *       or NULL to only reset the conversion state
 * @inlen:  the length of @in
 *
 * Runs a single iconv() call and maps its outcome to the return codes
 * used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space in @out (E2BIG), or
 *     -2 if the transcoding fails (EILSEQ: the input is not valid or
 *        cannot be represented in the target encoding), or
 *     -3 if the input ends with an incomplete sequence (EINVAL), or if
 *        input was left unconverted for any other reason.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * When @in is NULL both are set to 0, so any bytes iconv() may have
 * stored while resetting its state are not reported to the caller.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {
    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;
    int err;

    ret = iconv(cd,
                (ICONV_CONST char **) &icv_in, &icv_inlen,
                &icv_out, &icv_outlen);
    /* errno is only meaningful when iconv() reported a failure */
    err = (ret == (size_t) -1) ? errno : 0;

    if (in != NULL) {
        *inlen -= icv_inlen;
        *outlen -= icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }

    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (err == EILSEQ)
            return(-2);
#endif
#ifdef E2BIG
        if (err == E2BIG)
            return(-1);
#endif
#ifdef EINVAL
        if (err == EINVAL)
            return(-3);
#endif
        return(-3);
    }
    return(0);
}
1697#endif /* LIBXML_ICONV_ENABLED */
1698
1699/**
Daniel Veillardbe803962000-06-28 23:40:59 +00001700 * xmlCharEncFirstLine:
1701 * @handler: char enconding transformation data structure
1702 * @out: an xmlBuffer for the output.
1703 * @in: an xmlBuffer for the input
1704 *
1705 * Front-end for the encoding handler input function, but handle only
1706 * the very first line, i.e. limit itself to 45 chars.
1707 *
1708 * Returns the number of byte written if success, or
1709 * -1 general error
1710 * -2 if the transcoding fails (for *in is not valid utf8 string or
1711 * the result of transformation can't fit into the encoding we want), or
1712 */
1713int
1714xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1715 xmlBufferPtr in) {
1716 int ret = -2;
1717 int written;
1718 int toconv;
1719
1720 if (handler == NULL) return(-1);
1721 if (out == NULL) return(-1);
1722 if (in == NULL) return(-1);
1723
1724 written = out->size - out->use;
1725 toconv = in->use;
1726 if (toconv * 2 >= written) {
1727 xmlBufferGrow(out, toconv);
1728 written = out->size - out->use - 1;
1729 }
1730
1731 /*
1732 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1733 * 45 chars should be sufficient to reach the end of the encoding
1734 * decalration without going too far inside the document content.
1735 */
1736 written = 45;
1737
1738 if (handler->input != NULL) {
1739 ret = handler->input(&out->content[out->use], &written,
1740 in->content, &toconv);
1741 xmlBufferShrink(in, toconv);
1742 out->use += written;
1743 out->content[out->use] = 0;
1744 }
1745#ifdef LIBXML_ICONV_ENABLED
1746 else if (handler->iconv_in != NULL) {
1747 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1748 &written, in->content, &toconv);
1749 xmlBufferShrink(in, toconv);
1750 out->use += written;
1751 out->content[out->use] = 0;
1752 if (ret == -1) ret = -3;
1753 }
1754#endif /* LIBXML_ICONV_ENABLED */
1755#ifdef DEBUG_ENCODING
1756 switch (ret) {
1757 case 0:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001758 xmlGenericError(xmlGenericErrorContext,
1759 "converted %d bytes to %d bytes of input\n",
Daniel Veillardbe803962000-06-28 23:40:59 +00001760 toconv, written);
1761 break;
1762 case -1:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001763 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
Daniel Veillardbe803962000-06-28 23:40:59 +00001764 toconv, written, in->use);
1765 break;
1766 case -2:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001767 xmlGenericError(xmlGenericErrorContext,
1768 "input conversion failed due to input error\n");
Daniel Veillardbe803962000-06-28 23:40:59 +00001769 break;
1770 case -3:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001771 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
Daniel Veillardbe803962000-06-28 23:40:59 +00001772 toconv, written, in->use);
1773 break;
1774 default:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001775 xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
Daniel Veillardbe803962000-06-28 23:40:59 +00001776 }
1777#endif
1778 /*
1779 * Ignore when input buffer is not on a boundary
1780 */
1781 if (ret == -3) ret = 0;
1782 if (ret == -1) ret = 0;
1783 return(ret);
1784}
1785
1786/**
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001787 * xmlCharEncInFunc:
1788 * @handler: char enconding transformation data structure
1789 * @out: an xmlBuffer for the output.
1790 * @in: an xmlBuffer for the input
1791 *
1792 * Generic front-end for the encoding handler input function
1793 *
1794 * Returns the number of byte written if success, or
1795 * -1 general error
1796 * -2 if the transcoding fails (for *in is not valid utf8 string or
1797 * the result of transformation can't fit into the encoding we want), or
1798 */
1799int
1800xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1801 xmlBufferPtr in) {
1802 int ret = -2;
1803 int written;
1804 int toconv;
1805
1806 if (handler == NULL) return(-1);
1807 if (out == NULL) return(-1);
1808 if (in == NULL) return(-1);
1809
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001810 toconv = in->use;
Daniel Veillard87b95392000-08-12 21:12:04 +00001811 if (toconv == 0)
1812 return(0);
1813 written = out->size - out->use;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001814 if (toconv * 2 >= written) {
Daniel Veillarde2488192001-01-04 10:54:22 +00001815 xmlBufferGrow(out, out->size + toconv * 2);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001816 written = out->size - out->use - 1;
1817 }
1818 if (handler->input != NULL) {
1819 ret = handler->input(&out->content[out->use], &written,
1820 in->content, &toconv);
1821 xmlBufferShrink(in, toconv);
1822 out->use += written;
1823 out->content[out->use] = 0;
1824 }
1825#ifdef LIBXML_ICONV_ENABLED
1826 else if (handler->iconv_in != NULL) {
1827 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1828 &written, in->content, &toconv);
1829 xmlBufferShrink(in, toconv);
1830 out->use += written;
1831 out->content[out->use] = 0;
1832 if (ret == -1) ret = -3;
1833 }
1834#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001835 switch (ret) {
Daniel Veillardbe803962000-06-28 23:40:59 +00001836#ifdef DEBUG_ENCODING
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001837 case 0:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001838 xmlGenericError(xmlGenericErrorContext,
1839 "converted %d bytes to %d bytes of input\n",
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001840 toconv, written);
1841 break;
1842 case -1:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001843 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001844 toconv, written, in->use);
1845 break;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001846 case -3:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001847 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001848 toconv, written, in->use);
1849 break;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001850#endif
Daniel Veillardbe803962000-06-28 23:40:59 +00001851 case -2:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001852 xmlGenericError(xmlGenericErrorContext,
1853 "input conversion failed due to input error\n");
1854 xmlGenericError(xmlGenericErrorContext,
1855 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
Daniel Veillardbe803962000-06-28 23:40:59 +00001856 in->content[0], in->content[1],
1857 in->content[2], in->content[3]);
1858 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001859 /*
1860 * Ignore when input buffer is not on a boundary
1861 */
1862 if (ret == -3) ret = 0;
1863 return(ret);
1864}
1865
1866/**
1867 * xmlCharEncOutFunc:
1868 * @handler: char enconding transformation data structure
1869 * @out: an xmlBuffer for the output.
1870 * @in: an xmlBuffer for the input
1871 *
1872 * Generic front-end for the encoding handler output function
Daniel Veillardbe803962000-06-28 23:40:59 +00001873 * a first call with @in == NULL has to be made firs to initiate the
1874 * output in case of non-stateless encoding needing to initiate their
1875 * state or the output (like the BOM in UTF16).
1876 * In case of UTF8 sequence conversion errors for the given encoder,
1877 * the content will be automatically remapped to a CharRef sequence.
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001878 *
1879 * Returns the number of byte written if success, or
1880 * -1 general error
1881 * -2 if the transcoding fails (for *in is not valid utf8 string or
1882 * the result of transformation can't fit into the encoding we want), or
1883 */
1884int
1885xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1886 xmlBufferPtr in) {
1887 int ret = -2;
1888 int written;
Daniel Veillarde2488192001-01-04 10:54:22 +00001889 int writtentot = 0;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001890 int toconv;
Daniel Veillardbe803962000-06-28 23:40:59 +00001891 int output = 0;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001892
1893 if (handler == NULL) return(-1);
1894 if (out == NULL) return(-1);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001895
Daniel Veillardbe803962000-06-28 23:40:59 +00001896retry:
1897
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001898 written = out->size - out->use;
Daniel Veillardbe803962000-06-28 23:40:59 +00001899
1900 /*
1901 * First specific handling of in = NULL, i.e. the initialization call
1902 */
1903 if (in == NULL) {
1904 toconv = 0;
1905 if (handler->output != NULL) {
1906 ret = handler->output(&out->content[out->use], &written,
1907 NULL, &toconv);
1908 out->use += written;
1909 out->content[out->use] = 0;
1910 }
1911#ifdef LIBXML_ICONV_ENABLED
1912 else if (handler->iconv_out != NULL) {
1913 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1914 &written, NULL, &toconv);
1915 out->use += written;
1916 out->content[out->use] = 0;
1917 }
1918#endif /* LIBXML_ICONV_ENABLED */
1919#ifdef DEBUG_ENCODING
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001920 xmlGenericError(xmlGenericErrorContext,
1921 "initialized encoder\n");
Daniel Veillardbe803962000-06-28 23:40:59 +00001922#endif
1923 return(0);
1924 }
1925
1926 /*
1927 * Convertion itself.
1928 */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001929 toconv = in->use;
Daniel Veillard87b95392000-08-12 21:12:04 +00001930 if (toconv == 0)
1931 return(0);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001932 if (toconv * 2 >= written) {
1933 xmlBufferGrow(out, toconv * 2);
1934 written = out->size - out->use - 1;
1935 }
1936 if (handler->output != NULL) {
1937 ret = handler->output(&out->content[out->use], &written,
Daniel Veillardbe803962000-06-28 23:40:59 +00001938 in->content, &toconv);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001939 xmlBufferShrink(in, toconv);
1940 out->use += written;
Daniel Veillarde2488192001-01-04 10:54:22 +00001941 writtentot += written;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001942 out->content[out->use] = 0;
1943 }
1944#ifdef LIBXML_ICONV_ENABLED
1945 else if (handler->iconv_out != NULL) {
1946 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1947 &written, in->content, &toconv);
1948 xmlBufferShrink(in, toconv);
1949 out->use += written;
Daniel Veillarde2488192001-01-04 10:54:22 +00001950 writtentot += written;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001951 out->content[out->use] = 0;
Daniel Veillarde2488192001-01-04 10:54:22 +00001952 if (ret == -1) {
1953 if (written > 0) {
1954 /*
1955 * Can be a limitation of iconv
1956 */
1957 goto retry;
1958 }
1959 ret = -3;
1960 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001961 }
1962#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001963 else {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001964 xmlGenericError(xmlGenericErrorContext,
1965 "xmlCharEncOutFunc: no output function !\n");
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001966 return(-1);
1967 }
Daniel Veillardbe803962000-06-28 23:40:59 +00001968
1969 if (ret >= 0) output += ret;
1970
1971 /*
1972 * Attempt to handle error cases
1973 */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001974 switch (ret) {
Daniel Veillardbe803962000-06-28 23:40:59 +00001975#ifdef DEBUG_ENCODING
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001976 case 0:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001977 xmlGenericError(xmlGenericErrorContext,
1978 "converted %d bytes to %d bytes of output\n",
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001979 toconv, written);
1980 break;
1981 case -1:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001982 xmlGenericError(xmlGenericErrorContext,
1983 "output conversion failed by lack of space\n");
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001984 break;
Daniel Veillarde2488192001-01-04 10:54:22 +00001985#endif
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001986 case -3:
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00001987 xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001988 toconv, written, in->use);
1989 break;
Daniel Veillardbe803962000-06-28 23:40:59 +00001990 case -2: {
1991 int len = in->use;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001992 const xmlChar *utf = (const xmlChar *) in->content;
Daniel Veillardbe803962000-06-28 23:40:59 +00001993 int cur;
1994
1995 cur = xmlGetUTF8Char(utf, &len);
1996 if (cur > 0) {
1997 xmlChar charref[20];
1998
1999#ifdef DEBUG_ENCODING
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002000 xmlGenericError(xmlGenericErrorContext,
2001 "handling output conversion error\n");
2002 xmlGenericError(xmlGenericErrorContext,
2003 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
Daniel Veillardbe803962000-06-28 23:40:59 +00002004 in->content[0], in->content[1],
2005 in->content[2], in->content[3]);
2006#endif
2007 /*
2008 * Removes the UTF8 sequence, and replace it by a charref
2009 * and continue the transcoding phase, hoping the error
2010 * did not mangle the encoder state.
2011 */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00002012 sprintf((char *) charref, "&#x%X;", cur);
Daniel Veillardbe803962000-06-28 23:40:59 +00002013 xmlBufferShrink(in, len);
2014 xmlBufferAddHead(in, charref, -1);
2015
2016 goto retry;
2017 } else {
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002018 xmlGenericError(xmlGenericErrorContext,
2019 "output conversion failed due to conv error\n");
2020 xmlGenericError(xmlGenericErrorContext,
2021 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
Daniel Veillardbe803962000-06-28 23:40:59 +00002022 in->content[0], in->content[1],
2023 in->content[2], in->content[3]);
Daniel Veillard87b95392000-08-12 21:12:04 +00002024 in->content[0] = ' ';
Daniel Veillardbe803962000-06-28 23:40:59 +00002025 }
2026 break;
2027 }
2028 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00002029 return(ret);
2030}
2031
2032/**
2033 * xmlCharEncCloseFunc:
2034 * @handler: char enconding transformation data structure
2035 *
2036 * Generic front-end for hencoding handler close function
2037 *
2038 * Returns 0 if success, or -1 in case of error
2039 */
2040int
2041xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2042 int ret = 0;
2043 if (handler == NULL) return(-1);
2044 if (handler->name == NULL) return(-1);
2045#ifdef LIBXML_ICONV_ENABLED
2046 /*
2047 * Iconv handlers can be oused only once, free the whole block.
2048 * and the associated icon resources.
2049 */
2050 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2051 if (handler->name != NULL)
2052 xmlFree(handler->name);
2053 handler->name = NULL;
2054 if (handler->iconv_out != NULL) {
2055 if (iconv_close(handler->iconv_out))
2056 ret = -1;
2057 handler->iconv_out = NULL;
2058 }
2059 if (handler->iconv_in != NULL) {
2060 if (iconv_close(handler->iconv_in))
2061 ret = -1;
2062 handler->iconv_in = NULL;
2063 }
2064 xmlFree(handler);
2065 }
2066#endif /* LIBXML_ICONV_ENABLED */
2067#ifdef DEBUG_ENCODING
2068 if (ret)
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002069 xmlGenericError(xmlGenericErrorContext,
2070 "failed to close the encoding handler\n");
Daniel Veillard496a1cf2000-05-03 14:20:55 +00002071 else
Daniel Veillardd6d7f7b2000-10-25 19:56:55 +00002072 xmlGenericError(xmlGenericErrorContext,
2073 "closed the encoding handler\n");
Daniel Veillard496a1cf2000-05-03 14:20:55 +00002074
2075#endif
2076 return(ret);
2077}
2078