blob: a7e984e050d1d5e67e7b2e4ebdde1c8a7727fb1e [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code from "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
21
#include <ctype.h>
#include <string.h>
#include "encoding.h"
24
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F          0xxxxxxx
 * 0000 0080-0000 07FF          110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF          1110xxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 */
35
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in, in bytes
 *
 * Convert a block of ISO Latin 1 chars to UTF-8. Input bytes below 0x80
 * are copied as is; every other byte is expanded to a two byte sequence.
 * When space runs out the bytes already stored in @out are left in place.
 *
 * Returns the number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* const first = out;
    unsigned char* const limit = out + outlen;
    unsigned char* const stop = in + inlen;

    for ( ; in < stop; in++) {
        const unsigned char ch = *in;

        if (ch >= 0x80) {
            /* 0x80..0xFF: lead byte 110000xx, then trail byte 10xxxxxx */
            if (out >= limit) return(-1);
            *out++ = 0xC0 | (ch >> 6);
            if (out >= limit) return(-1);
            *out++ = 0x80 | (ch & 0x3F);
        } else {
            /* plain ASCII is identical in both encodings */
            if (out >= limit) return(-1);
            *out++ = ch;
        }
    }
    return(out - first);
}

70
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only ASCII bytes and two byte sequences starting
 * with 0xC2 or 0xC3 can be represented; anything else, including a
 * truncated sequence or a second byte which is not of the form 10xxxxxx,
 * makes the transcoding fail.
 * TODO: need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed.
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++= c;
        }
        else if ((c & 0xFE) == 0xC2) {
            /*
             * Lead bytes 0xC2 and 0xC3 carry the two high bits of a value
             * in 0x80..0xFF. The next byte must exist and must be a
             * continuation byte, otherwise the input is not valid UTF-8.
             */
            if ((in >= inend) || ((*in & 0xC0) != 0x80)) return -2;
            if (out >= outend) return -1;
            *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
        }
        else return -2;
    }
    return out-outstart;
}

106
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in, in unsigned shorts
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. A high surrogate (0xD800..0xDBFF) must be followed
 * by a low surrogate (0xDC00..0xDFFF); the pair is combined into a single
 * value above 0xFFFF and written as a four byte sequence. A low surrogate
 * that does not follow a high one is not rejected, it is encoded like any
 * other 16 bit value.
 *
 * Returns the number of bytes written, or -1 by lack of space or if a
 *     high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned short* inend= in+inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c= *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in<inend) && (((d=*in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Emit the lead byte; bits is the shift of the first continuation
         * byte still to be written (-6 means there is none).
         */
        if (out >= outend) return -1;
        if      (c <    0x80) {  *out++=  c;                bits= -6; }
        else if (c <   0x800) {  *out++= (c >>  6) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= (c >> 12) | 0xE0;  bits=  6; }
        else                  {  *out++= (c >> 18) | 0xF0;  bits= 12; }

        /* continuation bytes: 10xxxxxx, most significant group first */
        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend) return -1;
            *out++= ((c >> bits) & 0x3F) | 0x80;
        }
    }
    return out-outstart;
}

154
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Values up to 0xFFFF take one short, values from
 * 0x10000 up to 0x10FFFF are written as a surrogate pair.
 * The transcoding fails on a continuation byte found in leading
 * position, on a lead byte of 0xF8 or more, on a sequence which is
 * truncated or whose trailing bytes are not of the form 10xxxxxx, and on
 * a decoded value which does not fit in UTF-16.
 * TODO: need a fallback mechanism ...
 *
 * Returns the number of shorts written, or -1 by lack of space, or -2
 *     if the transcoding failed.
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart= out;
    unsigned short* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
      d= *in++;
      if      (d < 0x80)  { c= d; trailing= 0; }
      else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
      else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
      else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
      else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
      else return -2;    /* no chance for this in UTF-16 */

      for ( ; trailing; trailing--) {
          /* a missing or malformed trailing byte is bad input, not a full buffer */
          if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  return -2;
          c <<= 6;
          c |= d & 0x3F;
      }

      /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend)  return -1;
            *out++ = c;
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs two free slots */
            if (out+1 >= outend)  return -1;
            c -= 0x10000;
            *out++ = 0xD800 | (c >> 10);
            *out++ = 0xDC00 | (c & 0x03FF);
        }
        else  return -2;    /* beyond the range UTF-16 can represent */
    }
    return out-outstart;
}

206
Daniel Veillard97b58771998-10-20 06:14:16 +0000207
Daniel Veillard27d88741999-05-29 11:51:49 +0000208/**
209 * xmlDetectCharEncoding:
210 * @in: a pointer to the first bytes of the XML entity, must be at least
211 * 4 bytes long.
212 *
213 * Guess the encoding of the entity using the first bytes of the entity content
214 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
215 *
216 * Returns one of the XML_CHAR_ENCODING_... values.
217 */
218xmlCharEncoding
219xmlDetectCharEncoding(unsigned char* in)
220{
221 if ((in[0] == 0x00) && (in[1] == 0x00) &&
222 (in[2] == 0x00) && (in[3] == 0x3C))
223 return(XML_CHAR_ENCODING_UCS4BE);
224 if ((in[0] == 0x3C) && (in[1] == 0x00) &&
225 (in[2] == 0x00) && (in[3] == 0x00))
226 return(XML_CHAR_ENCODING_UCS4LE);
227 if ((in[0] == 0x00) && (in[1] == 0x00) &&
228 (in[2] == 0x3C) && (in[3] == 0x00))
229 return(XML_CHAR_ENCODING_UCS4_2143);
230 if ((in[0] == 0x00) && (in[1] == 0x3C) &&
231 (in[2] == 0x00) && (in[3] == 0x00))
232 return(XML_CHAR_ENCODING_UCS4_3412);
233 if ((in[0] == 0xFE) && (in[1] == 0xFF))
234 return(XML_CHAR_ENCODING_UTF16BE);
235 if ((in[0] == 0xFF) && (in[1] == 0xFE))
236 return(XML_CHAR_ENCODING_UTF16LE);
237 if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
238 (in[2] == 0xA7) && (in[3] == 0x94))
239 return(XML_CHAR_ENCODING_EBCDIC);
240 if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
241 (in[2] == 0x78) && (in[3] == 0x6D))
242 return(XML_CHAR_ENCODING_UTF8);
243 return(XML_CHAR_ENCODING_NONE);
244}
245
246/**
247 * xmlParseCharEncoding:
248 * @name: the encoding name as parsed, in UTF-8 format (ASCCI actually)
249 *
250 * Conpare the string to the known encoding schemes already known. Note
251 * that the comparison is case insensitive accordingly to the section
252 * [XML] 4.3.3 Character Encoding in Entities.
253 *
254 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
255 * if not recognized.
256 */
257xmlCharEncoding
258xmlParseCharEncoding(char* name)
259{
260 char upper[500];
261 int i;
262
263 for (i = 0;i < 499;i++) {
264 upper[i] = toupper(name[i]);
265 if (upper[i] == 0) break;
266 }
267 upper[i] = 0;
268
269 if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
270 if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
271 if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
272
273 /*
274 * NOTE: if we were able to parse this, the endianness of UTF16 is
275 * already found and in use
276 */
277 if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
278 if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
279
280 if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
281 if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
282 if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
283
284 /*
285 * NOTE: if we were able to parse this, the endianness of UCS4 is
286 * already found and in use
287 */
288 if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
289 if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
290 if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
291
292
293 if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
294 if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
295 if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
296
297 if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
298 if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
299 if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
300
301 if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
302 if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
303 if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
304 if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
305 if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
306 if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
307 if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
308
309 if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
310 if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
311 if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
312 return(XML_CHAR_ENCODING_ERROR);
313}