blob: 420092915edcc1bb446cd535cb0dfd5939b2de98 [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
21
Daniel Veillard3c558c31999-12-22 11:30:41 +000022#ifdef WIN32
23#include "win32config.h"
24#else
Daniel Veillardb96e6431999-08-29 21:02:19 +000025#include "config.h"
Daniel Veillard7f7d1111999-09-22 09:46:25 +000026#endif
27
Daniel Veillard14fff061999-06-22 21:49:07 +000028#include <stdio.h>
Daniel Veillard7f7d1111999-09-22 09:46:25 +000029#include <string.h>
30
31#ifdef HAVE_CTYPE_H
32#include <ctype.h>
33#endif
Daniel Veillard6d3bf1f1999-12-16 17:52:19 +000034#ifdef HAVE_STDLIB_H
35#include <stdlib.h>
36#endif
Daniel Veillard361d8452000-04-03 19:48:13 +000037#include <libxml/encoding.h>
38#include <libxml/xmlmemory.h>
Daniel Veillard891e4041998-10-19 00:43:02 +000039
Daniel Veillardcf461992000-03-14 18:30:20 +000040xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
41xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
Daniel Veillardb05deb71999-08-10 19:04:08 +000042
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 */
53
/**
 * xmlCheckUTF8:
 * @utf: pointer to a putative UTF-8 encoded string, NUL terminated
 *
 * Checks @utf for being structurally valid UTF-8: every lead byte must
 * announce a 1 to 4 byte sequence and be followed by the right number of
 * continuation bytes (10xxxxxx).
 *
 * This function is not super-strict: it accepts sequences longer than
 * necessary for the value they carry (Java is capable of producing these
 * if provoked), and while it enforces the 4-byte maximum size it does not
 * check for the 0x10ffff maximum value.
 *
 * Returns 1 if @utf is valid, 0 otherwise (including when @utf is NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx 10xxxxxx */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /*
             * 3-byte code. The tests short-circuit, so a byte is only
             * read once the previous one is known not to be the
             * terminating NUL.
             */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /*
             * Either a continuation byte (10xxxxxx) in lead position or
             * a lead byte announcing more than 4 bytes (11111xxx).
             */
            return(0);
        }
    }
    return(1);
}
97
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  pointer to the length of @in in bytes (read only, not updated)
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied as is, the others are
 * expanded to a two byte sequence. A two byte sequence is only emitted
 * when both bytes fit, so @out never ends on half a character.
 *
 * Returns the number of byte written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* inend = in + *inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend)
                return(-1);
            *out++ = c;
        } else {
            /* check the room for the whole sequence before writing */
            if (outend - out < 2)
                return(-1);
            *out++ = 0xC0 | (c >> 6);
            *out++ = 0x80 | (0x3F & c);
        }
    }
    return((int) (out - outstart));
}
132
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  pointer to the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only one byte sequences and the two byte sequences
 * whose value fits in 8 bits can be converted.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding fails (for *in is not valid utf8 string or
 *     the result of transformation can't fit into the encoding we want)
 * The value of @inlen after return is the number of octets consumed
 *     as the return value is positive, else unpredictable.
 */
int
UTF8Toisolat1(unsigned char* out, int outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* inend = in + *inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend)
                return(-1);
            *out++ = c;
        }
        else if (in == inend) {
            /* the block ends on a lead byte: leave it unconsumed */
            *inlen -= 1;
            break;
        }
        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
            /* a two byte utf-8 which can be encoded as ISO Latin 1 */
            if (out >= outend)
                return(-1);
            *out++ = ((c & 0x03) << 6) | (*in++ & 0x3F);
        }
        else
            return(-2);
        /* TODO : some should be represent as "&#x____;" */
    }
    return((int) (out - outstart));
}
178
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  pointer to the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read byte by byte, so the
 * result depends neither on the byte order of the machine nor on the
 * alignment of @inb.
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding fails (for *in is not valid utf16 string)
 * The value of *inlenb after return is the number of octets consumed
 *     as the return value is positive, else unpredictable. A trailing
 *     odd byte or a high surrogate without its second half at the end
 *     of the block is left unconsumed.
 */
int
UTF16LEToUTF8(unsigned char* out, int outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    /* a dangling odd byte cannot make a code unit */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;

    while (in < inend) {
        /* little endian: low order byte first */
        c = (unsigned int) in[0] | (((unsigned int) in[1]) << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {           /* (in > inend) shouldn't happens */
                (*inlenb) -= 2;
                break;
            }
            d = (unsigned int) in[0] | (((unsigned int) in[1]) << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00)
                return(-2);
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (out >= outend)
            return(-1);
        /* lead byte first; bits is the shift of the first trailing byte */
        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits -= 6) {
            if (out >= outend)
                return(-1);
            *out++ = ((c >> bits) & 0x3F) | 0x80;
        }
    }
    return((int) (out - outstart));
}
260
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  pointer to the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is stored byte by byte, so the result
 * depends neither on the byte order of the machine nor on the alignment
 * of @outb.
 * TODO: UTF8ToUTF16LE need a fallback mechanism ...
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding failed (malformed UTF-8 or a value which cannot
 *     be represented in UTF-16).
 * If the block ends in the middle of a multi-byte sequence, that sequence
 *     is left unconsumed and *inlen is lowered to the number of octets
 *     actually converted; otherwise *inlen is left untouched.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* inend = in + *inlen;
    unsigned int c, d, trailing;
    unsigned int hi, lo;

    /* only whole 16 bits units can be stored */
    outend = outb + (outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)
            return(-2);    /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else
            return(-2);    /* no chance for this in UTF-16 */

        if ((unsigned int) (inend - in) < trailing) {
            /*
             * Incomplete sequence at the end of the block: neither the
             * lead byte already read nor the trailing bytes present
             * count as consumed.
             */
            *inlen -= (int) (inend - in) + 1;
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                return(-2);    /* not a continuation byte */
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                return(-1);
            out[0] = (unsigned char) (c & 0xFF);
            out[1] = (unsigned char) (c >> 8);
            out += 2;
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. four bytes */
            if (outend - out < 4)
                return(-1);
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi & 0xFF);
            out[1] = (unsigned char) (hi >> 8);
            out[2] = (unsigned char) (lo & 0xFF);
            out[3] = (unsigned char) (lo >> 8);
            out += 4;
        }
        else
            return(-2);    /* beyond the range of UTF-16 */
    }
    return((int) (out - outb));
}
353
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out in bytes
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  pointer to the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read byte by byte, so the
 * result depends neither on the byte order of the machine nor on the
 * alignment of @inb.
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding fails (for *in is not valid utf16 string)
 * The value of *inlenb after return is the number of octets consumed
 *     as the return value is positive, else unpredictable. A trailing
 *     odd byte or a high surrogate without its second half at the end
 *     of the block is left unconsumed.
 */
int
UTF16BEToUTF8(unsigned char* out, int outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    /* a dangling odd byte cannot make a code unit */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = in + *inlenb;

    while (in < inend) {
        /* big endian: high order byte first */
        c = (((unsigned int) in[0]) << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {           /* (in > inend) shouldn't happens */
                (*inlenb) -= 2;
                break;
            }
            d = (((unsigned int) in[0]) << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00)
                return(-2);
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (out >= outend)
            return(-1);
        /* lead byte first; bits is the shift of the first trailing byte */
        if      (c <    0x80) {  *out++=  c;                       bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0; bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0; bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }

        for ( ; bits >= 0; bits -= 6) {
            if (out >= outend)
                return(-1);
            *out++ = ((c >> bits) & 0x3F) | 0x80;
        }
    }
    return((int) (out - outstart));
}
441
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  pointer to the length of @in in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is stored byte by byte, so the result
 * depends neither on the byte order of the machine nor on the alignment
 * of @outb.
 * TODO: UTF8ToUTF16BE need a fallback mechanism ...
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding failed (malformed UTF-8 or a value which cannot
 *     be represented in UTF-16).
 * If the block ends in the middle of a multi-byte sequence, that sequence
 *     is left unconsumed and *inlen is lowered to the number of octets
 *     actually converted; otherwise *inlen is left untouched.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* inend = in + *inlen;
    unsigned int c, d, trailing;
    unsigned int hi, lo;

    /* only whole 16 bits units can be stored */
    outend = outb + (outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c = d; trailing = 0; }
        else if (d < 0xC0)
            return(-2);    /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
        else
            return(-2);    /* no chance for this in UTF-16 */

        if ((unsigned int) (inend - in) < trailing) {
            /*
             * Incomplete sequence at the end of the block: neither the
             * lead byte already read nor the trailing bytes present
             * count as consumed.
             */
            *inlen -= (int) (inend - in) + 1;
            break;
        }

        for ( ; trailing; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80)
                return(-2);    /* not a continuation byte */
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                return(-1);
            out[0] = (unsigned char) (c >> 8);
            out[1] = (unsigned char) (c & 0xFF);
            out += 2;
        }
        else if (c < 0x110000) {
            /* needs a surrogate pair, i.e. four bytes */
            if (outend - out < 4)
                return(-1);
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            out[0] = (unsigned char) (hi >> 8);
            out[1] = (unsigned char) (hi & 0xFF);
            out[2] = (unsigned char) (lo >> 8);
            out[3] = (unsigned char) (lo & 0xFF);
            out += 4;
        }
        else
            return(-2);    /* beyond the range of UTF-16 */
    }
    return((int) (out - outb));
}
Daniel Veillard97b58771998-10-20 06:14:16 +0000531
Daniel Veillard27d88741999-05-29 11:51:49 +0000532/**
533 * xmlDetectCharEncoding:
534 * @in: a pointer to the first bytes of the XML entity, must be at least
535 * 4 bytes long.
Daniel Veillardcf461992000-03-14 18:30:20 +0000536 * @len: pointer to the length of the buffer
Daniel Veillard27d88741999-05-29 11:51:49 +0000537 *
538 * Guess the encoding of the entity using the first bytes of the entity content
539 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
540 *
541 * Returns one of the XML_CHAR_ENCODING_... values.
542 */
543xmlCharEncoding
Daniel Veillardcf461992000-03-14 18:30:20 +0000544xmlDetectCharEncoding(const unsigned char* in, int len)
Daniel Veillard27d88741999-05-29 11:51:49 +0000545{
Daniel Veillardcf461992000-03-14 18:30:20 +0000546 if (len >= 4) {
547 if ((in[0] == 0x00) && (in[1] == 0x00) &&
548 (in[2] == 0x00) && (in[3] == 0x3C))
549 return(XML_CHAR_ENCODING_UCS4BE);
550 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
551 (in[2] == 0x00) && (in[3] == 0x00))
552 return(XML_CHAR_ENCODING_UCS4LE);
553 if ((in[0] == 0x00) && (in[1] == 0x00) &&
554 (in[2] == 0x3C) && (in[3] == 0x00))
555 return(XML_CHAR_ENCODING_UCS4_2143);
556 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
557 (in[2] == 0x00) && (in[3] == 0x00))
558 return(XML_CHAR_ENCODING_UCS4_3412);
559 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
560 (in[2] == 0xA7) && (in[3] == 0x94))
561 return(XML_CHAR_ENCODING_EBCDIC);
562 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
563 (in[2] == 0x78) && (in[3] == 0x6D))
564 return(XML_CHAR_ENCODING_UTF8);
565 }
566 if (len >= 2) {
567 if ((in[0] == 0xFE) && (in[1] == 0xFF))
568 return(XML_CHAR_ENCODING_UTF16BE);
569 if ((in[0] == 0xFF) && (in[1] == 0xFE))
570 return(XML_CHAR_ENCODING_UTF16LE);
571 }
Daniel Veillard27d88741999-05-29 11:51:49 +0000572 return(XML_CHAR_ENCODING_NONE);
573}
574
575/**
576 * xmlParseCharEncoding:
Daniel Veillard7f858501999-11-17 17:32:38 +0000577 * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
Daniel Veillard27d88741999-05-29 11:51:49 +0000578 *
579 * Conpare the string to the known encoding schemes already known. Note
580 * that the comparison is case insensitive accordingly to the section
581 * [XML] 4.3.3 Character Encoding in Entities.
582 *
583 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
584 * if not recognized.
585 */
586xmlCharEncoding
Daniel Veillard011b63c1999-06-02 17:44:04 +0000587xmlParseCharEncoding(const char* name)
Daniel Veillard27d88741999-05-29 11:51:49 +0000588{
589 char upper[500];
590 int i;
591
592 for (i = 0;i < 499;i++) {
593 upper[i] = toupper(name[i]);
594 if (upper[i] == 0) break;
595 }
596 upper[i] = 0;
597
598 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
599 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
600 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
601
602 /*
603 * NOTE: if we were able to parse this, the endianness of UTF16 is
604 * already found and in use
605 */
606 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
607 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
608
609 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
610 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
611 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
612
613 /*
614 * NOTE: if we were able to parse this, the endianness of UCS4 is
615 * already found and in use
616 */
617 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
618 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
619 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
620
621
622 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
623 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
624 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
625
626 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
627 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
628 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
629
630 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
631 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
632 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
633 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
634 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
635 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
636 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
637
638 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
639 if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
640 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
641 return(XML_CHAR_ENCODING_ERROR);
642}
Daniel Veillard14fff061999-06-22 21:49:07 +0000643
644/****************************************************************
645 * *
646 * Char encoding handlers *
647 * *
648 ****************************************************************/
649
650/* the size should be growable, but it's not a big deal ... */
651#define MAX_ENCODING_HANDLERS 50
652static xmlCharEncodingHandlerPtr *handlers = NULL;
653static int nbCharEncodingHandler = 0;
654
655/*
656 * The default is UTF-8 for XML, that's also the default used for the
657 * parser internals, so the default encoding handler is NULL
658 */
659
660static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
661
662/**
663 * xmlNewCharEncodingHandler:
Daniel Veillard7f858501999-11-17 17:32:38 +0000664 * @name: the encoding name, in UTF-8 format (ASCII actually)
Daniel Veillard14fff061999-06-22 21:49:07 +0000665 * @input: the xmlCharEncodingInputFunc to read that encoding
666 * @output: the xmlCharEncodingOutputFunc to write that encoding
667 *
668 * Create and registers an xmlCharEncodingHandler.
669 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
670 */
671xmlCharEncodingHandlerPtr
Daniel Veillardcf461992000-03-14 18:30:20 +0000672xmlNewCharEncodingHandler(const char *name,
673 xmlCharEncodingInputFunc input,
Daniel Veillard14fff061999-06-22 21:49:07 +0000674 xmlCharEncodingOutputFunc output) {
675 xmlCharEncodingHandlerPtr handler;
676 char upper[500];
677 int i;
678 char *up = 0;
679
680 /*
681 * Keep only the uppercase version of the encoding.
682 */
683 if (name == NULL) {
684 fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
685 return(NULL);
686 }
687 for (i = 0;i < 499;i++) {
688 upper[i] = toupper(name[i]);
689 if (upper[i] == 0) break;
690 }
691 upper[i] = 0;
Daniel Veillard6454aec1999-09-02 22:04:43 +0000692 up = xmlMemStrdup(upper);
Daniel Veillard14fff061999-06-22 21:49:07 +0000693 if (up == NULL) {
694 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
695 return(NULL);
696 }
697
698 /*
699 * allocate and fill-up an handler block.
700 */
701 handler = (xmlCharEncodingHandlerPtr)
Daniel Veillard6454aec1999-09-02 22:04:43 +0000702 xmlMalloc(sizeof(xmlCharEncodingHandler));
Daniel Veillard14fff061999-06-22 21:49:07 +0000703 if (handler == NULL) {
704 fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
705 return(NULL);
706 }
707 handler->input = input;
708 handler->output = output;
709 handler->name = up;
710
711 /*
712 * registers and returns the handler.
713 */
714 xmlRegisterCharEncodingHandler(handler);
715 return(handler);
716}
717
718/**
719 * xmlInitCharEncodingHandlers:
720 *
721 * Initialize the char encoding support, it registers the default
722 * encoding supported.
Daniel Veillard7f858501999-11-17 17:32:38 +0000723 * NOTE: while public, this function usually doesn't need to be called
Daniel Veillard14fff061999-06-22 21:49:07 +0000724 * in normal processing.
725 */
726void
727xmlInitCharEncodingHandlers(void) {
728 if (handlers != NULL) return;
729
730 handlers = (xmlCharEncodingHandlerPtr *)
Daniel Veillard6454aec1999-09-02 22:04:43 +0000731 xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
Daniel Veillard14fff061999-06-22 21:49:07 +0000732
733 if (handlers == NULL) {
734 fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
735 return;
736 }
737 xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
Daniel Veillardcf461992000-03-14 18:30:20 +0000738 xmlUTF16LEHandler =
739 xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
740 xmlUTF16BEHandler =
741 xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
Daniel Veillard14fff061999-06-22 21:49:07 +0000742 xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
743}
744
745/**
Daniel Veillarda819dac1999-11-24 18:04:22 +0000746 * xmlCleanupCharEncodingHandlers:
747 *
748 * Cleanup the memory allocated for the char encoding support, it
749 * unregisters all the encoding handlers.
750 */
751void
752xmlCleanupCharEncodingHandlers(void) {
753 if (handlers == NULL) return;
754
755 for (;nbCharEncodingHandler > 0;) {
756 nbCharEncodingHandler--;
757 if (handlers[nbCharEncodingHandler] != NULL) {
758 xmlFree(handlers[nbCharEncodingHandler]->name);
759 xmlFree(handlers[nbCharEncodingHandler]);
760 }
761 }
762 xmlFree(handlers);
763 handlers = NULL;
764 nbCharEncodingHandler = 0;
765 xmlDefaultCharEncodingHandler = NULL;
766}
767
768/**
Daniel Veillard14fff061999-06-22 21:49:07 +0000769 * xmlRegisterCharEncodingHandler:
770 * @handler: the xmlCharEncodingHandlerPtr handler block
771 *
772 * Register the char encoding handler, surprizing, isn't it ?
773 */
774void
775xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
776 if (handlers == NULL) xmlInitCharEncodingHandlers();
777 if (handler == NULL) {
778 fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
779 return;
780 }
781
782 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
783 fprintf(stderr,
784 "xmlRegisterCharEncodingHandler: Too many handler registered\n");
785 fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
786 return;
787 }
788 handlers[nbCharEncodingHandler++] = handler;
789}
790
791/**
792 * xmlGetCharEncodingHandler:
793 * @enc: an xmlCharEncoding value.
794 *
795 * Search in the registrered set the handler able to read/write that encoding.
796 *
797 * Returns the handler or NULL if not found
798 */
799xmlCharEncodingHandlerPtr
800xmlGetCharEncodingHandler(xmlCharEncoding enc) {
801 if (handlers == NULL) xmlInitCharEncodingHandlers();
Daniel Veillardcf461992000-03-14 18:30:20 +0000802 switch (enc) {
803 case XML_CHAR_ENCODING_ERROR:
804 return(NULL);
805 case XML_CHAR_ENCODING_NONE:
806 return(NULL);
807 case XML_CHAR_ENCODING_UTF8:
808 return(NULL);
809 case XML_CHAR_ENCODING_UTF16LE:
810 return(xmlUTF16LEHandler);
811 case XML_CHAR_ENCODING_UTF16BE:
812 return(xmlUTF16BEHandler);
813 case XML_CHAR_ENCODING_EBCDIC:
814 return(NULL);
815 case XML_CHAR_ENCODING_UCS4LE:
816 return(NULL);
817 case XML_CHAR_ENCODING_UCS4BE:
818 return(NULL);
819 case XML_CHAR_ENCODING_UCS4_2143:
820 return(NULL);
821 case XML_CHAR_ENCODING_UCS4_3412:
822 return(NULL);
823 case XML_CHAR_ENCODING_UCS2:
824 return(NULL);
825 case XML_CHAR_ENCODING_8859_1:
826 return(NULL);
827 case XML_CHAR_ENCODING_8859_2:
828 return(NULL);
829 case XML_CHAR_ENCODING_8859_3:
830 return(NULL);
831 case XML_CHAR_ENCODING_8859_4:
832 return(NULL);
833 case XML_CHAR_ENCODING_8859_5:
834 return(NULL);
835 case XML_CHAR_ENCODING_8859_6:
836 return(NULL);
837 case XML_CHAR_ENCODING_8859_7:
838 return(NULL);
839 case XML_CHAR_ENCODING_8859_8:
840 return(NULL);
841 case XML_CHAR_ENCODING_8859_9:
842 return(NULL);
843 case XML_CHAR_ENCODING_2022_JP:
844 case XML_CHAR_ENCODING_SHIFT_JIS:
845 case XML_CHAR_ENCODING_EUC_JP:
846 return(NULL);
847 }
Daniel Veillard14fff061999-06-22 21:49:07 +0000848 return(NULL);
849}
850
851/**
852 * xmlGetCharEncodingHandler:
853 * @enc: a string describing the char encoding.
854 *
855 * Search in the registrered set the handler able to read/write that encoding.
856 *
857 * Returns the handler or NULL if not found
858 */
859xmlCharEncodingHandlerPtr
860xmlFindCharEncodingHandler(const char *name) {
861 char upper[500];
862 int i;
863
864 if (handlers == NULL) xmlInitCharEncodingHandlers();
865 if (name == NULL) return(xmlDefaultCharEncodingHandler);
866 if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
867
868 for (i = 0;i < 499;i++) {
869 upper[i] = toupper(name[i]);
870 if (upper[i] == 0) break;
871 }
872 upper[i] = 0;
873
874 for (i = 0;i < nbCharEncodingHandler; i++)
875 if (!strcmp(name, handlers[i]->name))
876 return(handlers[i]);
877
878 return(NULL);
879}
880