/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
22
Daniel Veillard3c558c31999-12-22 11:30:41 +000023#ifdef WIN32
24#include "win32config.h"
25#else
Daniel Veillardb96e6431999-08-29 21:02:19 +000026#include "config.h"
Daniel Veillard7f7d1111999-09-22 09:46:25 +000027#endif
28
Daniel Veillard14fff061999-06-22 21:49:07 +000029#include <stdio.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000030#include <string.h>
31
32#ifdef HAVE_CTYPE_H
33#include <ctype.h>
34#endif
Daniel Veillard6d3bf1f1999-12-16 17:52:19 +000035#ifdef HAVE_STDLIB_H
36#include <stdlib.h>
37#endif
Daniel Veillard496a1cf2000-05-03 14:20:55 +000038#include <libxml/xmlversion.h>
39#ifdef LIBXML_ICONV_ENABLED
40#ifdef HAVE_ERRNO_H
41#include <errno.h>
42#endif
43#endif
Daniel Veillard361d8452000-04-03 19:48:13 +000044#include <libxml/encoding.h>
45#include <libxml/xmlmemory.h>
Daniel Veillard32bc74e2000-07-14 14:49:25 +000046#ifdef LIBXML_HTML_ENABLED
47#include <libxml/HTMLparser.h>
48#endif
Daniel Veillard891e4041998-10-19 00:43:02 +000049
Daniel Veillardcf461992000-03-14 18:30:20 +000050xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
51xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Daniel Veillardb05deb71999-08-10 19:04:08 +000052
/*
 * One entry of the encoding alias table: @alias is an alternative
 * spelling that maps to the encoding called @name.
 */
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
struct _xmlCharEncodingAlias {
    const char *name;
    const char *alias;
};

/*
 * The alias table itself, the number of entries in use and the number
 * of entries allocated.  The table starts out empty.
 */
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;
static int xmlCharEncodingAliasesMax = 0;
63
#ifdef LIBXML_ICONV_ENABLED
#if 0
#define DEBUG_ENCODING  /* Define this to get encoding traces */
#endif
#endif

/*
 * Non-zero when 16-bit units are stored least significant byte first on
 * this machine.  The UTF-16 conversion routines consult it to decide
 * whether the bytes of a 16-bit unit must be swapped.
 * NOTE(review): defaults to 1; presumably corrected by an endianness
 * probe at initialization time -- confirm.
 */
static int xmlLittleEndian = 1;
71
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 */
82
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  on input, the number of bytes available at @utf; on output,
 *        the number of bytes used by the decoded character (0 on error)
 *
 * Read one UTF-8 encoded character (1 to 4 bytes) from @utf.
 *
 * Returns the character value, or -1 in case of error (NULL argument,
 * not enough bytes, or malformed sequence).  @len is updated whenever
 * it is not NULL.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* every multi-byte form has at least one continuation byte */
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* lead bytes 0xF8 and above would need more than 4 bytes */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return((int) c);

error:
    /* this label is also reached when len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
149
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid, false otherwise (including when
 * @utf is NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * A terminating 0 byte never matches the 10xxxxxx pattern, so each
     * continuation test below fails before the index can run past the
     * end of the string.
     */
    for (ix = 0; (c = utf[ix]);) {
        if (c & 0x80) {
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            if ((c & 0xe0) == 0xe0) {
                if ((utf[ix + 2] & 0xc0) != 0x80)
                    return(0);
                if ((c & 0xf0) == 0xf0) {
                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                        return(0);
                    /* 4-byte code */
                    ix += 4;
                } else {
                    /* 3-byte code */
                    ix += 3;
                }
            } else {
                /* 2-byte code */
                ix += 2;
            }
        } else {
            /* 1-byte code */
            ix++;
        }
    }
    return(1);
}
193
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out.  Each ASCII byte maps to exactly one UTF-8 byte,
 * so conversion proceeds until either the input or the output is
 * exhausted.
 *
 * Returns 0 if success (possibly after a partial conversion when @out
 * is too small), or -1 if a byte outside the ASCII range is found or a
 * required argument is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* inend;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /* no input: nothing to convert */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }

    /* negative lengths are treated as empty buffers */
    outend = out + ((*outlen > 0) ? *outlen : 0);
    inend = in + ((*inlen > 0) ? *inlen : 0);

    while ((in < inend) && (out < outend)) {
        if (*in >= 0x80) {
            /* not an ASCII char: report what was converted so far */
            *outlen = out - outstart;
            *inlen = in - instart;
            return(-1);
        }
        *out++ = *in++;
    }
    *outlen = out - outstart;
    *inlen = in - instart;
    return(0);
}
244
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.  A NULL @in is an initialization call and
 * produces no output.
 *
 * Returns 0 if success (conversion stops early, still returning 0, when
 * @out is full or the input ends in the middle of a sequence), -2 if
 * the transcoding fails (malformed UTF-8 or a char with no ASCII
 * equivalent), or -1 if a required argument is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else {
            /* no chance for this in Ascii */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        }

        /* sequence split across input blocks: wait for the rest */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte: malformed sequence */
                *outlen = out - outstart;
                *inlen = processed - instart;
                return(-2);
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = (unsigned char) c;
        } else {
            /* no chance for this in Ascii */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        }
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);
}
327
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.  Bytes below 0x80 are copied, all others expand
 * to a 2-byte sequence; a char is only converted when @out has room for
 * its complete encoding.
 *
 * Returns 0 if success (possibly after a partial conversion when @out
 * is too small), or -1 if a required argument is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /* no input: nothing to convert */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }

    /* negative lengths are treated as empty buffers */
    outend = out + ((*outlen > 0) ? *outlen : 0);
    inend = in + ((*inlen > 0) ? *inlen : 0);

    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = (unsigned char) c;
        } else {
            /* never leave half of a 2-byte sequence in the output */
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            *out++ = (unsigned char) ((c & 0x3F) | 0x80);
        }
        in++;
    }
    *outlen = out - outstart;
    *inlen = in - instart;
    return(0);
}
374
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.  A NULL @in is an initialization call and
 * produces no output.
 *
 * Returns 0 if success (conversion stops early, still returning 0, when
 * @out is full or the input ends in the middle of a sequence), -2 if
 * the transcoding fails (malformed UTF-8 or a char above 0xFF), or -1
 * if a required argument is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else {
            /* no chance for this in IsoLat1 */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        }

        /* sequence split across input blocks: wait for the rest */
        if (inend - in < trailing) {
            break;
        }

        /* the check above guarantees all trailing bytes are available */
        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte: malformed sequence */
                *outlen = out - outstart;
                *inlen = processed - instart;
                return(-2);
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c <= 0xFF) {
            if (out >= outend)
                break;
            *out++ = (unsigned char) c;
        } else {
            /* no chance for this in IsoLat1 */
            *outlen = out - outstart;
            *inlen = processed - instart;
            return(-2);
        }
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);
}
462
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE chars passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE chars in and try to convert it to an UTF-8
 * block of chars out.  The input is read byte by byte, least
 * significant byte first, so the result depends neither on the
 * endianness of the machine nor on the alignment of @inb.  A trailing
 * odd byte is ignored, and a char is only converted when @out has room
 * for its complete UTF-8 encoding.
 *
 * Returns 0 if success (conversion stops early, still returning 0, when
 * @out is full or the input ends on the first half of a surrogate
 * pair), -2 if the transcoding fails (a high surrogate not followed by
 * a low surrogate), or -1 if a required argument is NULL.
 * The value of @inlenb after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d;
    int inlen, needed, bits;

    if ((outlen == NULL) || (inlenb == NULL))
        return(-1);
    if (inb == NULL) {
        /* no input: nothing to convert */
        *outlen = 0;
        *inlenb = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlenb = 0;
        return(-1);
    }

    /* only whole 16-bit units are converted */
    inlen = (*inlenb > 0) ? *inlenb : 0;
    if ((inlen % 2) == 1)
        inlen--;
    inend = inb + inlen;
    outend = out + ((*outlen > 0) ? *outlen : 0);

    while (in < inend) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        next = in + 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (next >= inend) {
                /* pair split across input blocks: wait for the rest */
                break;
            }
            d = (unsigned int) next[0] | ((unsigned int) next[1] << 8);
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = in - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80)  needed = 1;
        else if (c <   0x800)  needed = 2;
        else if (c < 0x10000)  needed = 3;
        else                   needed = 4;
        if (outend - out < needed)
            break;

        if      (c <    0x80) {  *out++= (unsigned char) c;  bits= -6; }
        else if (c <   0x800) {
            *out++= (unsigned char) (((c >>  6) & 0x1F) | 0xC0);  bits=  0;
        } else if (c < 0x10000) {
            *out++= (unsigned char) (((c >> 12) & 0x0F) | 0xE0);  bits=  6;
        } else {
            *out++= (unsigned char) (((c >> 18) & 0x07) | 0xF0);  bits= 12;
        }

        for ( ; bits >= 0; bits-= 6)
            *out++= (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        in = next;
    }
    *outlen = out - outstart;
    *inlenb = in - inb;
    return(0);
}
550
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.  The output is written byte by byte, least
 * significant byte first, so the result depends neither on the
 * endianness of the machine nor on the alignment of @outb.
 *
 * A NULL @in is an initialization call: the FF FE Byte Order Mark is
 * written when @outb has room for it, and 2 is returned in that case.
 *
 * Returns 0 if success (conversion stops early, still returning 0, when
 * @outb is full or the input ends in the middle of a sequence), 2 when
 * the Byte Order Mark was written, -2 if the transcoding failed
 * (malformed UTF-8 or a value above 0x10FFFF), or -1 if a required
 * argument is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int high, low;
    int trailing;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if ((outb != NULL) && (*outlen >= 2)) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (outb == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }

    inend = in + ((*inlen > 0) ? *inlen : 0);
    /* only whole 16-bit units are produced */
    outend = outb + ((*outlen > 0) ? (*outlen / 2) * 2 : 0);

    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            *outlen = out - outb;
            *inlen = processed - instart;
            return(-2);
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else {
            /* no chance for this in UTF-16 */
            *outlen = out - outb;
            *inlen = processed - instart;
            return(-2);
        }

        /* sequence split across input blocks: wait for the rest */
        if (inend - in < trailing) {
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* not a continuation byte: malformed sequence */
                *outlen = out - outb;
                *inlen = processed - instart;
                return(-2);
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair: two 16-bit units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            high = 0xD800 | (c >> 10);
            low = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (high & 0xFF);
            *out++ = (unsigned char) (high >> 8);
            *out++ = (unsigned char) (low & 0xFF);
            *out++ = (unsigned char) (low >> 8);
        }
        else {
            /* beyond the range UTF-16 can represent */
            *outlen = out - outb;
            *inlen = processed - instart;
            return(-2);
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(0);
}
668
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE chars passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE chars in and try to convert it to an UTF-8
 * block of chars out.  The input is read byte by byte, most
 * significant byte first, so the result depends neither on the
 * endianness of the machine nor on the alignment of @inb.  A trailing
 * odd byte is ignored, and a char is only converted when @out has room
 * for its complete UTF-8 encoding, so @out never ends with a truncated
 * sequence.
 *
 * Returns 0 if success (conversion stops early, still returning 0, when
 * @out is full or the input ends on the first half of a surrogate
 * pair), -2 if the transcoding fails (a high surrogate not followed by
 * a low surrogate), or -1 if a required argument is NULL.
 * The value of @inlenb after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d;
    int inlen, needed, bits;

    if ((outlen == NULL) || (inlenb == NULL))
        return(-1);
    if (inb == NULL) {
        /* no input: nothing to convert */
        *outlen = 0;
        *inlenb = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlenb = 0;
        return(-1);
    }

    /* only whole 16-bit units are converted */
    inlen = (*inlenb > 0) ? *inlenb : 0;
    if ((inlen % 2) == 1)
        inlen--;
    inend = inb + inlen;
    outend = out + ((*outlen > 0) ? *outlen : 0);

    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        next = in + 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (next >= inend) {
                /* pair split across input blocks: wait for the rest */
                break;
            }
            d = ((unsigned int) next[0] << 8) | (unsigned int) next[1];
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = in - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UTF-4 value */
        if      (c <    0x80)  needed = 1;
        else if (c <   0x800)  needed = 2;
        else if (c < 0x10000)  needed = 3;
        else                   needed = 4;
        if (outend - out < needed)
            break;

        if      (c <    0x80) {  *out++= (unsigned char) c;  bits= -6; }
        else if (c <   0x800) {
            *out++= (unsigned char) (((c >>  6) & 0x1F) | 0xC0);  bits=  0;
        } else if (c < 0x10000) {
            *out++= (unsigned char) (((c >> 12) & 0x0F) | 0xE0);  bits=  6;
        } else {
            *out++= (unsigned char) (((c >> 18) & 0x07) | 0xF0);  bits= 12;
        }

        for ( ; bits >= 0; bits-= 6)
            *out++= (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        in = next;
    }
    *outlen = out - outstart;
    *inlenb = in - inb;
    return(0);
}
760
761/**
762 * UTF8ToUTF16BE:
763 * @outb: a pointer to an array of bytes to store the result
764 * @outlen: the length of @outb
765 * @in: a pointer to an array of UTF-8 chars
766 * @inlen: the length of @in
767 *
768 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
769 * block of chars out.
Daniel Veillardcf461992000-03-14 18:30:20 +0000770 *
771 * Returns the number of byte written, or -1 by lack of space, or -2
772 * if the transcoding failed.
773 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    /*
     * Convert a block of UTF-8 bytes to big-endian UTF-16.
     *
     * outb/outlen: output buffer and, on entry, its size in bytes; on
     *              return *outlen is the number of bytes written.
     * in/inlen:    input bytes and, on entry, their count; on return
     *              *inlen is the number of input bytes fully converted.
     *              A NULL @in requests the Byte Order Mark only.
     *
     * Returns 2 when the BOM was written, 0 after a conversion pass (which
     * may have stopped early on a full output buffer or a truncated input
     * sequence), or -2 on an invalid lead byte.
     */
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *processed = in;
    const unsigned char *inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    inend = in + *inlen;
    /* only whole 16-bit code units can be emitted */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            *outlen = (int) (out - outb);
            *inlen = (int) (processed - instart);
            return(-2);
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            *outlen = (int) (out - outb);
            *inlen = (int) (processed - instart);
            return(-2);
        }

        /* incomplete sequence at the end of the block: stop before it */
        if (inend - in < trailing) {
            break;
        }

        /*
         * NOTE(review): a malformed continuation byte only stops the
         * accumulation here, as in the previous implementation; the
         * partially built value is still emitted below.
         */
        for ( ; trailing; trailing--) {
            if ((in >= inend) || (((d = *in++) & 0xC0) != 0x80)) break;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) break;
            /* big-endian: high byte first, whatever the host order is */
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) c;
        }
        else if (c < 0x110000) {
            unsigned int high, low;

            /* a surrogate pair needs two code units (4 bytes) */
            if (out + 2 >= outend) break;
            c -= 0x10000;
            high = 0xD800 | (c >> 10);
            low = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (high >> 8);
            *out++ = (unsigned char) high;
            *out++ = (unsigned char) (low >> 8);
            *out++ = (unsigned char) low;
        }
        else
            break;
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(0);
}
Daniel Veillard97b58771998-10-20 06:14:16 +0000875
Daniel Veillard27d88741999-05-29 11:51:49 +0000876/**
877 * xmlDetectCharEncoding:
878 * @in: a pointer to the first bytes of the XML entity, must be at least
879 * 4 bytes long.
Daniel Veillardcf461992000-03-14 18:30:20 +0000880 * @len: pointer to the length of the buffer
Daniel Veillard27d88741999-05-29 11:51:49 +0000881 *
882 * Guess the encoding of the entity using the first bytes of the entity content
883 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
884 *
885 * Returns one of the XML_CHAR_ENCODING_... values.
886 */
887xmlCharEncoding
Daniel Veillardcf461992000-03-14 18:30:20 +0000888xmlDetectCharEncoding(const unsigned char* in, int len)
Daniel Veillard27d88741999-05-29 11:51:49 +0000889{
Daniel Veillardcf461992000-03-14 18:30:20 +0000890 if (len >= 4) {
891 if ((in[0] == 0x00) && (in[1] == 0x00) &&
892 (in[2] == 0x00) && (in[3] == 0x3C))
893 return(XML_CHAR_ENCODING_UCS4BE);
894 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
895 (in[2] == 0x00) && (in[3] == 0x00))
896 return(XML_CHAR_ENCODING_UCS4LE);
897 if ((in[0] == 0x00) && (in[1] == 0x00) &&
898 (in[2] == 0x3C) && (in[3] == 0x00))
899 return(XML_CHAR_ENCODING_UCS4_2143);
900 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
901 (in[2] == 0x00) && (in[3] == 0x00))
902 return(XML_CHAR_ENCODING_UCS4_3412);
903 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
904 (in[2] == 0xA7) && (in[3] == 0x94))
905 return(XML_CHAR_ENCODING_EBCDIC);
906 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
907 (in[2] == 0x78) && (in[3] == 0x6D))
908 return(XML_CHAR_ENCODING_UTF8);
909 }
910 if (len >= 2) {
911 if ((in[0] == 0xFE) && (in[1] == 0xFF))
912 return(XML_CHAR_ENCODING_UTF16BE);
913 if ((in[0] == 0xFF) && (in[1] == 0xFE))
914 return(XML_CHAR_ENCODING_UTF16LE);
915 }
Daniel Veillard27d88741999-05-29 11:51:49 +0000916 return(XML_CHAR_ENCODING_NONE);
917}
918
919/**
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +0000920 * xmlCleanupEncodingAliases:
921 *
922 * Unregisters all aliases
923 */
924void
925xmlCleanupEncodingAliases(void) {
926 int i;
927
928 if (xmlCharEncodingAliases == NULL)
929 return;
930
931 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
932 if (xmlCharEncodingAliases[i].name != NULL)
933 xmlFree((char *) xmlCharEncodingAliases[i].name);
934 if (xmlCharEncodingAliases[i].alias != NULL)
935 xmlFree((char *) xmlCharEncodingAliases[i].alias);
936 }
937 xmlCharEncodingAliasesNb = 0;
938 xmlCharEncodingAliasesMax = 0;
939 xmlFree(xmlCharEncodingAliases);
940}
941
942/**
943 * xmlGetEncodingAlias:
944 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
945 *
946 * Lookup an encoding name for the given alias.
947 *
948 * Returns NULL if not found the original name otherwise
949 */
950const char *
951xmlGetEncodingAlias(const char *alias) {
952 int i;
953 char upper[100];
954
955 if (alias == NULL)
956 return(NULL);
957
958 if (xmlCharEncodingAliases == NULL)
959 return(NULL);
960
961 for (i = 0;i < 99;i++) {
962 upper[i] = toupper(alias[i]);
963 if (upper[i] == 0) break;
964 }
965 upper[i] = 0;
966
967 /*
968 * Walk down the list looking for a definition of the alias
969 */
970 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
971 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
972 return(xmlCharEncodingAliases[i].name);
973 }
974 }
975 return(NULL);
976}
977
978/**
979 * xmlAddEncodingAlias:
980 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
981 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
982 *
983 * Registers and alias @alias for an encoding named @name. Existing alias
984 * will be overwritten.
985 *
986 * Returns 0 in case of success, -1 in case of error
987 */
988int
989xmlAddEncodingAlias(const char *name, const char *alias) {
990 int i;
991 char upper[100];
992
993 if ((name == NULL) || (alias == NULL))
994 return(-1);
995
996 for (i = 0;i < 99;i++) {
997 upper[i] = toupper(alias[i]);
998 if (upper[i] == 0) break;
999 }
1000 upper[i] = 0;
1001
1002 if (xmlCharEncodingAliases == NULL) {
1003 xmlCharEncodingAliasesNb = 0;
1004 xmlCharEncodingAliasesMax = 20;
1005 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1006 xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1007 if (xmlCharEncodingAliases == NULL)
1008 return(-1);
1009 } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1010 xmlCharEncodingAliasesMax *= 2;
1011 xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1012 xmlRealloc(xmlCharEncodingAliases,
1013 xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1014 }
1015 /*
1016 * Walk down the list looking for a definition of the alias
1017 */
1018 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1019 if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1020 /*
1021 * Replace the definition.
1022 */
1023 xmlFree((char *) xmlCharEncodingAliases[i].name);
1024 xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1025 return(0);
1026 }
1027 }
1028 /*
1029 * Add the definition
1030 */
1031 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1032 xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1033 xmlCharEncodingAliasesNb++;
1034 return(0);
1035}
1036
1037/**
1038 * xmlDelEncodingAlias:
1039 * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1040 *
1041 * Unregisters an encoding alias @alias
1042 *
1043 * Returns 0 in case of success, -1 in case of error
1044 */
1045int
1046xmlDelEncodingAlias(const char *alias) {
1047 int i;
1048
1049 if (alias == NULL)
1050 return(-1);
1051
1052 if (xmlCharEncodingAliases == NULL)
1053 return(-1);
1054 /*
1055 * Walk down the list looking for a definition of the alias
1056 */
1057 for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1058 if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1059 xmlFree((char *) xmlCharEncodingAliases[i].name);
1060 xmlFree((char *) xmlCharEncodingAliases[i].alias);
1061 xmlCharEncodingAliasesNb--;
1062 memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1063 sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1064 return(0);
1065 }
1066 }
1067 return(-1);
1068}
1069
1070/**
Daniel Veillard27d88741999-05-29 11:51:49 +00001071 * xmlParseCharEncoding:
Daniel Veillard7f858501999-11-17 17:32:38 +00001072 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
Daniel Veillard27d88741999-05-29 11:51:49 +00001073 *
1074 * Conpare the string to the known encoding schemes already known. Note
1075 * that the comparison is case insensitive accordingly to the section
1076 * [XML] 4.3.3 Character Encoding in Entities.
1077 *
1078 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1079 * if not recognized.
1080 */
1081xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +00001082xmlParseCharEncoding(const char* name)
Daniel Veillard27d88741999-05-29 11:51:49 +00001083{
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001084 const char *alias;
Daniel Veillard27d88741999-05-29 11:51:49 +00001085 char upper[500];
1086 int i;
1087
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001088 if (name == NULL)
1089 return(XML_CHAR_ENCODING_NONE);
1090
1091 /*
1092 * Do the alias resolution
1093 */
1094 alias = xmlGetEncodingAlias(name);
1095 if (alias != NULL)
1096 name = alias;
1097
Daniel Veillard27d88741999-05-29 11:51:49 +00001098 for (i = 0;i < 499;i++) {
1099 upper[i] = toupper(name[i]);
1100 if (upper[i] == 0) break;
1101 }
1102 upper[i] = 0;
1103
1104 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1105 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1106 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1107
1108 /*
1109 * NOTE: if we were able to parse this, the endianness of UTF16 is
1110 * already found and in use
1111 */
1112 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1113 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1114
1115 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1116 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1117 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1118
1119 /*
1120 * NOTE: if we were able to parse this, the endianness of UCS4 is
1121 * already found and in use
1122 */
1123 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1124 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1125 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1126
1127
1128 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1129 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1130 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1131
1132 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1133 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1134 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1135
1136 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1137 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1138 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1139 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1140 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1141 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1142 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1143
1144 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001145 if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
Daniel Veillard27d88741999-05-29 11:51:49 +00001146 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001147
1148#ifdef DEBUG_ENCODING
1149 fprintf(stderr, "Unknown encoding %s\n", name);
1150#endif
Daniel Veillard27d88741999-05-29 11:51:49 +00001151 return(XML_CHAR_ENCODING_ERROR);
1152}
Daniel Veillard14fff061999-06-22 21:49:07 +00001153
Daniel Veillardbe803962000-06-28 23:40:59 +00001154/**
1155 * xmlGetCharEncodingName:
1156 * @enc: the encoding
1157 *
1158 * The "canonical" name for XML encoding.
1159 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1160 * Section 4.3.3 Character Encoding in Entities
1161 *
1162 * Returns the canonical name for the given encoding
1163 */
1164
1165const char*
1166xmlGetCharEncodingName(xmlCharEncoding enc) {
1167 switch (enc) {
1168 case XML_CHAR_ENCODING_ERROR:
1169 return(NULL);
1170 case XML_CHAR_ENCODING_NONE:
1171 return(NULL);
1172 case XML_CHAR_ENCODING_UTF8:
1173 return("UTF-8");
1174 case XML_CHAR_ENCODING_UTF16LE:
1175 return("UTF-16");
1176 case XML_CHAR_ENCODING_UTF16BE:
1177 return("UTF-16");
1178 case XML_CHAR_ENCODING_EBCDIC:
1179 return("EBCDIC");
1180 case XML_CHAR_ENCODING_UCS4LE:
1181 return("ISO-10646-UCS-4");
1182 case XML_CHAR_ENCODING_UCS4BE:
1183 return("ISO-10646-UCS-4");
1184 case XML_CHAR_ENCODING_UCS4_2143:
1185 return("ISO-10646-UCS-4");
1186 case XML_CHAR_ENCODING_UCS4_3412:
1187 return("ISO-10646-UCS-4");
1188 case XML_CHAR_ENCODING_UCS2:
1189 return("ISO-10646-UCS-2");
1190 case XML_CHAR_ENCODING_8859_1:
1191 return("ISO-8859-1");
1192 case XML_CHAR_ENCODING_8859_2:
1193 return("ISO-8859-2");
1194 case XML_CHAR_ENCODING_8859_3:
1195 return("ISO-8859-3");
1196 case XML_CHAR_ENCODING_8859_4:
1197 return("ISO-8859-4");
1198 case XML_CHAR_ENCODING_8859_5:
1199 return("ISO-8859-5");
1200 case XML_CHAR_ENCODING_8859_6:
1201 return("ISO-8859-6");
1202 case XML_CHAR_ENCODING_8859_7:
1203 return("ISO-8859-7");
1204 case XML_CHAR_ENCODING_8859_8:
1205 return("ISO-8859-8");
1206 case XML_CHAR_ENCODING_8859_9:
1207 return("ISO-8859-9");
1208 case XML_CHAR_ENCODING_2022_JP:
1209 return("ISO-2022-JP");
1210 case XML_CHAR_ENCODING_SHIFT_JIS:
1211 return("Shift-JIS");
1212 case XML_CHAR_ENCODING_EUC_JP:
1213 return("EUC-JP");
Daniel Veillard87b95392000-08-12 21:12:04 +00001214 case XML_CHAR_ENCODING_ASCII:
1215 return(NULL);
Daniel Veillardbe803962000-06-28 23:40:59 +00001216 }
1217 return(NULL);
1218}
1219
Daniel Veillard14fff061999-06-22 21:49:07 +00001220/****************************************************************
1221 * *
1222 * Char encoding handlers *
1223 * *
1224 ****************************************************************/
1225
/*
 * Registry of the known encoding handlers.
 *
 * The table is allocated by xmlInitCharEncodingHandlers() with room for
 * MAX_ENCODING_HANDLERS entries; nbCharEncodingHandler counts the slots
 * in use. xmlRegisterCharEncodingHandler() refuses new handlers once the
 * table is full.
 *
 * the size should be growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
static xmlCharEncodingHandlerPtr *handlers = NULL;
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1237
1238/**
1239 * xmlNewCharEncodingHandler:
Daniel Veillard7f858501999-11-17 17:32:38 +00001240 * @name: the encoding name, in UTF-8 format (ASCII actually)
Daniel Veillard14fff061999-06-22 21:49:07 +00001241 * @input: the xmlCharEncodingInputFunc to read that encoding
1242 * @output: the xmlCharEncodingOutputFunc to write that encoding
1243 *
1244 * Create and registers an xmlCharEncodingHandler.
1245 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1246 */
1247xmlCharEncodingHandlerPtr
Daniel Veillardcf461992000-03-14 18:30:20 +00001248xmlNewCharEncodingHandler(const char *name,
1249 xmlCharEncodingInputFunc input,
Daniel Veillard14fff061999-06-22 21:49:07 +00001250 xmlCharEncodingOutputFunc output) {
1251 xmlCharEncodingHandlerPtr handler;
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001252 const char *alias;
Daniel Veillard14fff061999-06-22 21:49:07 +00001253 char upper[500];
1254 int i;
1255 char *up = 0;
1256
1257 /*
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001258 * Do the alias resolution
1259 */
1260 alias = xmlGetEncodingAlias(name);
1261 if (alias != NULL)
1262 name = alias;
1263
1264 /*
Daniel Veillard14fff061999-06-22 21:49:07 +00001265 * Keep only the uppercase version of the encoding.
1266 */
1267 if (name == NULL) {
1268 fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
1269 return(NULL);
1270 }
1271 for (i = 0;i < 499;i++) {
1272 upper[i] = toupper(name[i]);
1273 if (upper[i] == 0) break;
1274 }
1275 upper[i] = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +00001276 up = xmlMemStrdup(upper);
Daniel Veillard14fff061999-06-22 21:49:07 +00001277 if (up == NULL) {
1278 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
1279 return(NULL);
1280 }
1281
1282 /*
1283 * allocate and fill-up an handler block.
1284 */
1285 handler = (xmlCharEncodingHandlerPtr)
Daniel Veillard6454aec1999-09-02 22:04:43 +00001286 xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard14fff061999-06-22 21:49:07 +00001287 if (handler == NULL) {
1288 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
1289 return(NULL);
1290 }
1291 handler->input = input;
1292 handler->output = output;
1293 handler->name = up;
1294
Daniel Veillard87b95392000-08-12 21:12:04 +00001295#ifdef LIBXML_ICONV_ENABLED
1296 handler->iconv_in = NULL;
1297 handler->iconv_out = NULL;
1298#endif /* LIBXML_ICONV_ENABLED */
1299
Daniel Veillard14fff061999-06-22 21:49:07 +00001300 /*
1301 * registers and returns the handler.
1302 */
1303 xmlRegisterCharEncodingHandler(handler);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001304#ifdef DEBUG_ENCODING
1305 fprintf(stderr, "Registered encoding handler for %s\n", name);
1306#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001307 return(handler);
1308}
1309
1310/**
1311 * xmlInitCharEncodingHandlers:
1312 *
1313 * Initialize the char encoding support, it registers the default
1314 * encoding supported.
Daniel Veillard7f858501999-11-17 17:32:38 +00001315 * NOTE: while public, this function usually doesn't need to be called
Daniel Veillard14fff061999-06-22 21:49:07 +00001316 * in normal processing.
1317 */
1318void
1319xmlInitCharEncodingHandlers(void) {
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001320 unsigned short int tst = 0x1234;
1321 unsigned char *ptr = (unsigned char *) &tst;
1322
Daniel Veillard14fff061999-06-22 21:49:07 +00001323 if (handlers != NULL) return;
1324
1325 handlers = (xmlCharEncodingHandlerPtr *)
Daniel Veillard6454aec1999-09-02 22:04:43 +00001326 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
Daniel Veillard14fff061999-06-22 21:49:07 +00001327
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001328 if (*ptr == 0x12) xmlLittleEndian = 0;
1329 else if (*ptr == 0x34) xmlLittleEndian = 1;
1330 else fprintf(stderr, "Odd problem at endianness detection\n");
1331
Daniel Veillard14fff061999-06-22 21:49:07 +00001332 if (handlers == NULL) {
1333 fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
1334 return;
1335 }
1336 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
Daniel Veillardcf461992000-03-14 18:30:20 +00001337 xmlUTF16LEHandler =
1338 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1339 xmlUTF16BEHandler =
1340 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
Daniel Veillard14fff061999-06-22 21:49:07 +00001341 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001342 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1343#ifdef LIBXML_HTML_ENABLED
1344 xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1345#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001346}
1347
1348/**
Daniel Veillarda819dac1999-11-24 18:04:22 +00001349 * xmlCleanupCharEncodingHandlers:
1350 *
1351 * Cleanup the memory allocated for the char encoding support, it
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001352 * unregisters all the encoding handlers and the aliases.
Daniel Veillarda819dac1999-11-24 18:04:22 +00001353 */
1354void
1355xmlCleanupCharEncodingHandlers(void) {
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001356 xmlCleanupEncodingAliases();
1357
Daniel Veillarda819dac1999-11-24 18:04:22 +00001358 if (handlers == NULL) return;
1359
1360 for (;nbCharEncodingHandler > 0;) {
1361 nbCharEncodingHandler--;
1362 if (handlers[nbCharEncodingHandler] != NULL) {
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001363 if (handlers[nbCharEncodingHandler]->name != NULL)
1364 xmlFree(handlers[nbCharEncodingHandler]->name);
Daniel Veillarda819dac1999-11-24 18:04:22 +00001365 xmlFree(handlers[nbCharEncodingHandler]);
1366 }
1367 }
1368 xmlFree(handlers);
1369 handlers = NULL;
1370 nbCharEncodingHandler = 0;
1371 xmlDefaultCharEncodingHandler = NULL;
1372}
1373
1374/**
Daniel Veillard14fff061999-06-22 21:49:07 +00001375 * xmlRegisterCharEncodingHandler:
1376 * @handler: the xmlCharEncodingHandlerPtr handler block
1377 *
1378 * Register the char encoding handler, surprizing, isn't it ?
1379 */
1380void
1381xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1382 if (handlers == NULL) xmlInitCharEncodingHandlers();
1383 if (handler == NULL) {
1384 fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
1385 return;
1386 }
1387
1388 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1389 fprintf(stderr,
1390 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1391 fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1392 return;
1393 }
1394 handlers[nbCharEncodingHandler++] = handler;
1395}
1396
1397/**
1398 * xmlGetCharEncodingHandler:
1399 * @enc: an xmlCharEncoding value.
1400 *
1401 * Search in the registrered set the handler able to read/write that encoding.
1402 *
1403 * Returns the handler or NULL if not found
1404 */
1405xmlCharEncodingHandlerPtr
1406xmlGetCharEncodingHandler(xmlCharEncoding enc) {
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001407 xmlCharEncodingHandlerPtr handler;
1408
Daniel Veillard14fff061999-06-22 21:49:07 +00001409 if (handlers == NULL) xmlInitCharEncodingHandlers();
Daniel Veillardcf461992000-03-14 18:30:20 +00001410 switch (enc) {
1411 case XML_CHAR_ENCODING_ERROR:
1412 return(NULL);
1413 case XML_CHAR_ENCODING_NONE:
1414 return(NULL);
1415 case XML_CHAR_ENCODING_UTF8:
1416 return(NULL);
1417 case XML_CHAR_ENCODING_UTF16LE:
1418 return(xmlUTF16LEHandler);
1419 case XML_CHAR_ENCODING_UTF16BE:
1420 return(xmlUTF16BEHandler);
1421 case XML_CHAR_ENCODING_EBCDIC:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001422 handler = xmlFindCharEncodingHandler("EBCDIC");
1423 if (handler != NULL) return(handler);
1424 handler = xmlFindCharEncodingHandler("ebcdic");
1425 if (handler != NULL) return(handler);
1426 break;
Daniel Veillardbe803962000-06-28 23:40:59 +00001427 case XML_CHAR_ENCODING_UCS4BE:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001428 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1429 if (handler != NULL) return(handler);
1430 handler = xmlFindCharEncodingHandler("UCS-4");
1431 if (handler != NULL) return(handler);
1432 handler = xmlFindCharEncodingHandler("UCS4");
1433 if (handler != NULL) return(handler);
1434 break;
Daniel Veillardbe803962000-06-28 23:40:59 +00001435 case XML_CHAR_ENCODING_UCS4LE:
1436 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1437 if (handler != NULL) return(handler);
1438 handler = xmlFindCharEncodingHandler("UCS-4");
1439 if (handler != NULL) return(handler);
1440 handler = xmlFindCharEncodingHandler("UCS4");
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001441 if (handler != NULL) return(handler);
1442 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001443 case XML_CHAR_ENCODING_UCS4_2143:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001444 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001445 case XML_CHAR_ENCODING_UCS4_3412:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001446 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001447 case XML_CHAR_ENCODING_UCS2:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001448 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1449 if (handler != NULL) return(handler);
1450 handler = xmlFindCharEncodingHandler("UCS-2");
1451 if (handler != NULL) return(handler);
1452 handler = xmlFindCharEncodingHandler("UCS2");
1453 if (handler != NULL) return(handler);
1454 break;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001455
1456 /*
1457 * We used to keep ISO Latin encodings native in the
1458 * generated data. This led to so many problems that
1459 * this has been removed. One can still change this
1460 * back by registering no-ops encoders for those
1461 */
Daniel Veillardcf461992000-03-14 18:30:20 +00001462 case XML_CHAR_ENCODING_8859_1:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001463 handler = xmlFindCharEncodingHandler("ISO-8859-1");
1464 if (handler != NULL) return(handler);
1465 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001466 case XML_CHAR_ENCODING_8859_2:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001467 handler = xmlFindCharEncodingHandler("ISO-8859-2");
1468 if (handler != NULL) return(handler);
1469 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001470 case XML_CHAR_ENCODING_8859_3:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001471 handler = xmlFindCharEncodingHandler("ISO-8859-3");
1472 if (handler != NULL) return(handler);
1473 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001474 case XML_CHAR_ENCODING_8859_4:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001475 handler = xmlFindCharEncodingHandler("ISO-8859-4");
1476 if (handler != NULL) return(handler);
1477 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001478 case XML_CHAR_ENCODING_8859_5:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001479 handler = xmlFindCharEncodingHandler("ISO-8859-5");
1480 if (handler != NULL) return(handler);
1481 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001482 case XML_CHAR_ENCODING_8859_6:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001483 handler = xmlFindCharEncodingHandler("ISO-8859-6");
1484 if (handler != NULL) return(handler);
1485 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001486 case XML_CHAR_ENCODING_8859_7:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001487 handler = xmlFindCharEncodingHandler("ISO-8859-7");
1488 if (handler != NULL) return(handler);
1489 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001490 case XML_CHAR_ENCODING_8859_8:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001491 handler = xmlFindCharEncodingHandler("ISO-8859-8");
1492 if (handler != NULL) return(handler);
1493 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001494 case XML_CHAR_ENCODING_8859_9:
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001495 handler = xmlFindCharEncodingHandler("ISO-8859-9");
1496 if (handler != NULL) return(handler);
1497 break;
1498
1499
Daniel Veillardcf461992000-03-14 18:30:20 +00001500 case XML_CHAR_ENCODING_2022_JP:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001501 handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1502 if (handler != NULL) return(handler);
1503 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001504 case XML_CHAR_ENCODING_SHIFT_JIS:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001505 handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1506 if (handler != NULL) return(handler);
1507 handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1508 if (handler != NULL) return(handler);
1509 handler = xmlFindCharEncodingHandler("Shift_JIS");
1510 if (handler != NULL) return(handler);
1511 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001512 case XML_CHAR_ENCODING_EUC_JP:
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001513 handler = xmlFindCharEncodingHandler("EUC-JP");
1514 if (handler != NULL) return(handler);
1515 break;
1516 default:
1517 break;
Daniel Veillardcf461992000-03-14 18:30:20 +00001518 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001519
1520#ifdef DEBUG_ENCODING
1521 fprintf(stderr, "No handler found for encoding %d\n", enc);
1522#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001523 return(NULL);
1524}
1525
1526/**
1527 * xmlGetCharEncodingHandler:
1528 * @enc: a string describing the char encoding.
1529 *
1530 * Search in the registrered set the handler able to read/write that encoding.
1531 *
1532 * Returns the handler or NULL if not found
1533 */
1534xmlCharEncodingHandlerPtr
1535xmlFindCharEncodingHandler(const char *name) {
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001536 const char *nalias;
1537 const char *norig;
Daniel Veillardbe803962000-06-28 23:40:59 +00001538 xmlCharEncoding alias;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001539#ifdef LIBXML_ICONV_ENABLED
Daniel Veillard3f6f7f62000-06-30 17:58:25 +00001540 xmlCharEncodingHandlerPtr enc;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001541 iconv_t icv_in, icv_out;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001542#endif /* LIBXML_ICONV_ENABLED */
1543 char upper[100];
Daniel Veillard14fff061999-06-22 21:49:07 +00001544 int i;
1545
1546 if (handlers == NULL) xmlInitCharEncodingHandlers();
1547 if (name == NULL) return(xmlDefaultCharEncodingHandler);
1548 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1549
Daniel Veillardbe803962000-06-28 23:40:59 +00001550 /*
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001551 * Do the alias resolution
1552 */
1553 norig = name;
1554 nalias = xmlGetEncodingAlias(name);
1555 if (nalias != NULL)
1556 name = nalias;
1557
1558 /*
Daniel Veillardbe803962000-06-28 23:40:59 +00001559 * Check first for directly registered encoding names
1560 */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001561 for (i = 0;i < 99;i++) {
Daniel Veillard14fff061999-06-22 21:49:07 +00001562 upper[i] = toupper(name[i]);
1563 if (upper[i] == 0) break;
1564 }
1565 upper[i] = 0;
1566
1567 for (i = 0;i < nbCharEncodingHandler; i++)
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001568 if (!strcmp(upper, handlers[i]->name)) {
1569#ifdef DEBUG_ENCODING
1570 fprintf(stderr, "Found registered handler for encoding %s\n", name);
1571#endif
Daniel Veillard14fff061999-06-22 21:49:07 +00001572 return(handlers[i]);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001573 }
Daniel Veillard14fff061999-06-22 21:49:07 +00001574
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001575#ifdef LIBXML_ICONV_ENABLED
1576 /* check whether iconv can handle this */
1577 icv_in = iconv_open("UTF-8", name);
1578 icv_out = iconv_open(name, "UTF-8");
1579 if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001580 enc = (xmlCharEncodingHandlerPtr)
1581 xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001582 if (enc == NULL) {
1583 iconv_close(icv_in);
1584 iconv_close(icv_out);
1585 return(NULL);
1586 }
Daniel Veillard365e13b2000-07-02 07:56:37 +00001587 enc->name = xmlMemStrdup(name);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001588 enc->input = NULL;
1589 enc->output = NULL;
1590 enc->iconv_in = icv_in;
1591 enc->iconv_out = icv_out;
1592#ifdef DEBUG_ENCODING
1593 fprintf(stderr, "Found iconv handler for encoding %s\n", name);
1594#endif
1595 return enc;
1596 } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1597 fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
1598 }
1599#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillardbe803962000-06-28 23:40:59 +00001600
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001601#ifdef DEBUG_ENCODING
1602 fprintf(stderr, "No handler found for encoding %s\n", name);
1603#endif
Daniel Veillardbe803962000-06-28 23:40:59 +00001604
1605 /*
1606 * Fallback using the canonical names
1607 */
Daniel Veillardf0cc7cc2000-08-26 21:40:43 +00001608 alias = xmlParseCharEncoding(norig);
Daniel Veillardbe803962000-06-28 23:40:59 +00001609 if (alias != XML_CHAR_ENCODING_ERROR) {
1610 const char* canon;
1611 canon = xmlGetCharEncodingName(alias);
1612 if ((canon != NULL) && (strcmp(name, canon))) {
1613 return(xmlFindCharEncodingHandler(canon));
1614 }
1615 }
1616
Daniel Veillard14fff061999-06-22 21:49:07 +00001617 return(NULL);
1618}
1619
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001620#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv conversion descriptor
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to the array of bytes to convert; may be NULL, in
 *       which case both lengths are reported back as 0
 * @inlen:  the length of @in
 *
 * Adapter between iconv() and the calling convention used by the
 * encoding handlers of this file: lengths are passed as int and the
 * errno value set by iconv() is mapped to a small negative code.
 *
 * Returns 0 if success, or
 *     -1 by lack of space in @out (E2BIG), or
 *     -2 if the transcoding fails (EILSEQ: the input is not valid in the
 *        source encoding or cannot be represented in the target one), or
 *     -3 if the last bytes of @in can't form a complete character
 *        (EINVAL), or for any other failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;    /* iconv() returns size_t, (size_t) -1 on error */

    /*
     * The second argument of iconv() is declared "char **" by POSIX but
     * "const char **" by some implementations; going through void *
     * is accepted by both prototypes.
     */
    ret = iconv(cd,
                (void *) &icv_in, &icv_inlen,
                &icv_out, &icv_outlen);
    if (in != NULL) {
        /* turn "bytes left" into "bytes consumed/produced" */
        *inlen -= icv_inlen;
        *outlen -= icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }
    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (errno == EILSEQ)
            return(-2);
#endif
#ifdef E2BIG
        if (errno == E2BIG)
            return(-1);
#endif
        /* EINVAL (truncated input sequence) and anything else */
        return(-3);
    }
    return(0);
}
1681#endif /* LIBXML_ICONV_ENABLED */
1682
1683/**
Daniel Veillardbe803962000-06-28 23:40:59 +00001684 * xmlCharEncFirstLine:
1685 * @handler: char enconding transformation data structure
1686 * @out: an xmlBuffer for the output.
1687 * @in: an xmlBuffer for the input
1688 *
1689 * Front-end for the encoding handler input function, but handle only
1690 * the very first line, i.e. limit itself to 45 chars.
1691 *
1692 * Returns the number of byte written if success, or
1693 * -1 general error
1694 * -2 if the transcoding fails (for *in is not valid utf8 string or
1695 * the result of transformation can't fit into the encoding we want), or
1696 */
1697int
1698xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1699 xmlBufferPtr in) {
1700 int ret = -2;
1701 int written;
1702 int toconv;
1703
1704 if (handler == NULL) return(-1);
1705 if (out == NULL) return(-1);
1706 if (in == NULL) return(-1);
1707
1708 written = out->size - out->use;
1709 toconv = in->use;
1710 if (toconv * 2 >= written) {
1711 xmlBufferGrow(out, toconv);
1712 written = out->size - out->use - 1;
1713 }
1714
1715 /*
1716 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1717 * 45 chars should be sufficient to reach the end of the encoding
1718 * decalration without going too far inside the document content.
1719 */
1720 written = 45;
1721
1722 if (handler->input != NULL) {
1723 ret = handler->input(&out->content[out->use], &written,
1724 in->content, &toconv);
1725 xmlBufferShrink(in, toconv);
1726 out->use += written;
1727 out->content[out->use] = 0;
1728 }
1729#ifdef LIBXML_ICONV_ENABLED
1730 else if (handler->iconv_in != NULL) {
1731 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1732 &written, in->content, &toconv);
1733 xmlBufferShrink(in, toconv);
1734 out->use += written;
1735 out->content[out->use] = 0;
1736 if (ret == -1) ret = -3;
1737 }
1738#endif /* LIBXML_ICONV_ENABLED */
1739#ifdef DEBUG_ENCODING
1740 switch (ret) {
1741 case 0:
1742 fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1743 toconv, written);
1744 break;
1745 case -1:
1746 fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1747 toconv, written, in->use);
1748 break;
1749 case -2:
1750 fprintf(stderr, "input conversion failed due to input error\n");
1751 break;
1752 case -3:
1753 fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1754 toconv, written, in->use);
1755 break;
1756 default:
1757 fprintf(stderr,"Unknown input conversion failed %d\n", ret);
1758 }
1759#endif
1760 /*
1761 * Ignore when input buffer is not on a boundary
1762 */
1763 if (ret == -3) ret = 0;
1764 if (ret == -1) ret = 0;
1765 return(ret);
1766}
1767
1768/**
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001769 * xmlCharEncInFunc:
1770 * @handler: char enconding transformation data structure
1771 * @out: an xmlBuffer for the output.
1772 * @in: an xmlBuffer for the input
1773 *
1774 * Generic front-end for the encoding handler input function
1775 *
1776 * Returns the number of byte written if success, or
1777 * -1 general error
1778 * -2 if the transcoding fails (for *in is not valid utf8 string or
1779 * the result of transformation can't fit into the encoding we want), or
1780 */
1781int
1782xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1783 xmlBufferPtr in) {
1784 int ret = -2;
1785 int written;
1786 int toconv;
1787
1788 if (handler == NULL) return(-1);
1789 if (out == NULL) return(-1);
1790 if (in == NULL) return(-1);
1791
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001792 toconv = in->use;
Daniel Veillard87b95392000-08-12 21:12:04 +00001793 if (toconv == 0)
1794 return(0);
1795 written = out->size - out->use;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001796 if (toconv * 2 >= written) {
1797 xmlBufferGrow(out, toconv * 2);
1798 written = out->size - out->use - 1;
1799 }
1800 if (handler->input != NULL) {
1801 ret = handler->input(&out->content[out->use], &written,
1802 in->content, &toconv);
1803 xmlBufferShrink(in, toconv);
1804 out->use += written;
1805 out->content[out->use] = 0;
1806 }
1807#ifdef LIBXML_ICONV_ENABLED
1808 else if (handler->iconv_in != NULL) {
1809 ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1810 &written, in->content, &toconv);
1811 xmlBufferShrink(in, toconv);
1812 out->use += written;
1813 out->content[out->use] = 0;
1814 if (ret == -1) ret = -3;
1815 }
1816#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001817 switch (ret) {
Daniel Veillardbe803962000-06-28 23:40:59 +00001818#ifdef DEBUG_ENCODING
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001819 case 0:
1820 fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1821 toconv, written);
1822 break;
1823 case -1:
1824 fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1825 toconv, written, in->use);
1826 break;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001827 case -3:
1828 fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1829 toconv, written, in->use);
1830 break;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001831#endif
Daniel Veillardbe803962000-06-28 23:40:59 +00001832 case -2:
1833 fprintf(stderr, "input conversion failed due to input error\n");
1834 fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1835 in->content[0], in->content[1],
1836 in->content[2], in->content[3]);
1837 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001838 /*
1839 * Ignore when input buffer is not on a boundary
1840 */
1841 if (ret == -3) ret = 0;
1842 return(ret);
1843}
1844
1845/**
1846 * xmlCharEncOutFunc:
1847 * @handler: char enconding transformation data structure
1848 * @out: an xmlBuffer for the output.
1849 * @in: an xmlBuffer for the input
1850 *
1851 * Generic front-end for the encoding handler output function
Daniel Veillardbe803962000-06-28 23:40:59 +00001852 * a first call with @in == NULL has to be made firs to initiate the
1853 * output in case of non-stateless encoding needing to initiate their
1854 * state or the output (like the BOM in UTF16).
1855 * In case of UTF8 sequence conversion errors for the given encoder,
1856 * the content will be automatically remapped to a CharRef sequence.
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001857 *
1858 * Returns the number of byte written if success, or
1859 * -1 general error
1860 * -2 if the transcoding fails (for *in is not valid utf8 string or
1861 * the result of transformation can't fit into the encoding we want), or
1862 */
1863int
1864xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1865 xmlBufferPtr in) {
1866 int ret = -2;
1867 int written;
1868 int toconv;
Daniel Veillardbe803962000-06-28 23:40:59 +00001869 int output = 0;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001870
1871 if (handler == NULL) return(-1);
1872 if (out == NULL) return(-1);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001873
Daniel Veillardbe803962000-06-28 23:40:59 +00001874retry:
1875
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001876 written = out->size - out->use;
Daniel Veillardbe803962000-06-28 23:40:59 +00001877
1878 /*
1879 * First specific handling of in = NULL, i.e. the initialization call
1880 */
1881 if (in == NULL) {
1882 toconv = 0;
1883 if (handler->output != NULL) {
1884 ret = handler->output(&out->content[out->use], &written,
1885 NULL, &toconv);
1886 out->use += written;
1887 out->content[out->use] = 0;
1888 }
1889#ifdef LIBXML_ICONV_ENABLED
1890 else if (handler->iconv_out != NULL) {
1891 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1892 &written, NULL, &toconv);
1893 out->use += written;
1894 out->content[out->use] = 0;
1895 }
1896#endif /* LIBXML_ICONV_ENABLED */
1897#ifdef DEBUG_ENCODING
1898 fprintf(stderr, "initialized encoder\n");
1899#endif
1900 return(0);
1901 }
1902
1903 /*
1904 * Convertion itself.
1905 */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001906 toconv = in->use;
Daniel Veillard87b95392000-08-12 21:12:04 +00001907 if (toconv == 0)
1908 return(0);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001909 if (toconv * 2 >= written) {
1910 xmlBufferGrow(out, toconv * 2);
1911 written = out->size - out->use - 1;
1912 }
1913 if (handler->output != NULL) {
1914 ret = handler->output(&out->content[out->use], &written,
Daniel Veillardbe803962000-06-28 23:40:59 +00001915 in->content, &toconv);
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001916 xmlBufferShrink(in, toconv);
1917 out->use += written;
1918 out->content[out->use] = 0;
1919 }
1920#ifdef LIBXML_ICONV_ENABLED
1921 else if (handler->iconv_out != NULL) {
1922 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1923 &written, in->content, &toconv);
1924 xmlBufferShrink(in, toconv);
1925 out->use += written;
1926 out->content[out->use] = 0;
1927 if (ret == -1) ret = -3;
1928 }
1929#endif /* LIBXML_ICONV_ENABLED */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001930 else {
1931 fprintf(stderr, "xmlCharEncOutFunc: no output function !\n");
1932 return(-1);
1933 }
Daniel Veillardbe803962000-06-28 23:40:59 +00001934
1935 if (ret >= 0) output += ret;
1936
1937 /*
1938 * Attempt to handle error cases
1939 */
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001940 switch (ret) {
Daniel Veillardbe803962000-06-28 23:40:59 +00001941#ifdef DEBUG_ENCODING
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001942 case 0:
1943 fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1944 toconv, written);
1945 break;
1946 case -1:
1947 fprintf(stderr, "output conversion failed by lack of space\n");
1948 break;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001949 case -3:
1950 fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
1951 toconv, written, in->use);
1952 break;
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001953#endif
Daniel Veillardbe803962000-06-28 23:40:59 +00001954 case -2: {
1955 int len = in->use;
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001956 const xmlChar *utf = (const xmlChar *) in->content;
Daniel Veillardbe803962000-06-28 23:40:59 +00001957 int cur;
1958
1959 cur = xmlGetUTF8Char(utf, &len);
1960 if (cur > 0) {
1961 xmlChar charref[20];
1962
1963#ifdef DEBUG_ENCODING
1964 fprintf(stderr, "handling output conversion error\n");
1965 fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1966 in->content[0], in->content[1],
1967 in->content[2], in->content[3]);
1968#endif
1969 /*
1970 * Removes the UTF8 sequence, and replace it by a charref
1971 * and continue the transcoding phase, hoping the error
1972 * did not mangle the encoder state.
1973 */
Daniel Veillard32bc74e2000-07-14 14:49:25 +00001974 sprintf((char *) charref, "&#x%X;", cur);
Daniel Veillardbe803962000-06-28 23:40:59 +00001975 xmlBufferShrink(in, len);
1976 xmlBufferAddHead(in, charref, -1);
1977
1978 goto retry;
1979 } else {
1980 fprintf(stderr, "output conversion failed due to conv error\n");
1981 fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1982 in->content[0], in->content[1],
1983 in->content[2], in->content[3]);
Daniel Veillard87b95392000-08-12 21:12:04 +00001984 in->content[0] = ' ';
Daniel Veillardbe803962000-06-28 23:40:59 +00001985 }
1986 break;
1987 }
1988 }
Daniel Veillard496a1cf2000-05-03 14:20:55 +00001989 return(ret);
1990}
1991
1992/**
1993 * xmlCharEncCloseFunc:
1994 * @handler: char enconding transformation data structure
1995 *
1996 * Generic front-end for hencoding handler close function
1997 *
1998 * Returns 0 if success, or -1 in case of error
1999 */
2000int
2001xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2002 int ret = 0;
2003 if (handler == NULL) return(-1);
2004 if (handler->name == NULL) return(-1);
2005#ifdef LIBXML_ICONV_ENABLED
2006 /*
2007 * Iconv handlers can be oused only once, free the whole block.
2008 * and the associated icon resources.
2009 */
2010 if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2011 if (handler->name != NULL)
2012 xmlFree(handler->name);
2013 handler->name = NULL;
2014 if (handler->iconv_out != NULL) {
2015 if (iconv_close(handler->iconv_out))
2016 ret = -1;
2017 handler->iconv_out = NULL;
2018 }
2019 if (handler->iconv_in != NULL) {
2020 if (iconv_close(handler->iconv_in))
2021 ret = -1;
2022 handler->iconv_in = NULL;
2023 }
2024 xmlFree(handler);
2025 }
2026#endif /* LIBXML_ICONV_ENABLED */
2027#ifdef DEBUG_ENCODING
2028 if (ret)
2029 fprintf(stderr, "failed to close the encoding handler\n");
2030 else
2031 fprintf(stderr, "closed the encoding handler\n");
2032
2033#endif
2034 return(ret);
2035}
2036