blob: 295ca39a201d5dded0d79a53d81fc4c627f5c0f2 [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 character codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code from "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
21
22#include "encoding.h"
23
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon !
 * (The four-octet form is only reached through UTF-16 surrogate pairs.)
 */
34
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged; every other
 * byte becomes a two-byte sequence (110000xx 10xxxxxx).
 *
 * A two-byte sequence is only written when both bytes fit, so on failure
 * @out never ends with a dangling lead byte.
 *
 * return values: number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else {
            /* Check room for the whole sequence before emitting any of it. */
            if (outend - out < 2) return -1;
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (0x3F & c));
        }
    }
    return (int) (out - outstart);
}
69
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only ASCII bytes and the two-byte sequences led by
 * 0xC2 or 0xC3 (code points 0x80-0xFF) can be represented.
 * TODO: need a fallback mechanism ...
 *
 * return values: the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed (input not representable in Latin 1,
 *     truncated sequence, or invalid continuation byte).
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else if ((c & 0xFE) == 0xC2) {
            /* Lead byte 0xC2/0xC3: exactly one continuation byte follows. */
            if (in >= inend) return -2;             /* truncated sequence */
            if ((*in & 0xC0) != 0x80) return -2;    /* not 10xxxxxx */
            if (out >= outend) return -1;
            *out++ = (unsigned char) (((c & 0x03) << 6) | (*in++ & 0x3F));
        }
        else return -2;
    }
    return (int) (out - outstart);
}
105
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in (number of shorts)
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. A high surrogate followed by a low surrogate is
 * combined into one code point and emitted as a four-byte sequence.
 *
 * A sequence is only written when all of its bytes fit in @out.
 *
 * return values: number of bytes written, or -1 by lack of space or when a
 *     high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned short* inend = in + inlen;
    unsigned int c, d, lead;
    int trailing;   /* number of 10xxxxxx bytes following the lead byte */

    while (in < inend) {
        c = *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in < inend) && (((d = *in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        if (c < 0x80)         { lead = c;                 trailing = 0; }
        else if (c < 0x800)   { lead = (c >> 6) | 0xC0;   trailing = 1; }
        else if (c < 0x10000) { lead = (c >> 12) | 0xE0;  trailing = 2; }
        else                  { lead = (c >> 18) | 0xF0;  trailing = 3; }

        /* Check room for the whole sequence before emitting any of it. */
        if (outend - out < 1 + trailing) return -1;

        *out++ = (unsigned char) lead;
        while (trailing > 0) {
            trailing--;
            /* Each continuation byte carries the next 6 bits, high to low. */
            *out++ = (unsigned char) (0x80 | ((c >> (6 * trailing)) & 0x3F));
        }
    }
    return (int) (out - outstart);
}
153
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Code points of 0x10000 and above are written as a
 * surrogate pair, and only when both halves fit in @out.
 * TODO: need a fallback mechanism ...
 *
 * return values: the number of shorts written, or -1 by lack of space, or -2
 *     if the transcoding failed (invalid lead byte, truncated sequence,
 *     invalid continuation byte, or value outside the UTF-16 range).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart = out;
    unsigned short* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d = *in++;
        if (d < 0x80)      { c = d; trailing = 0; }
        else if (d < 0xC0) return -2;    /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* Malformed input is a transcoding failure, not lack of space. */
            if (in >= inend) return -2;
            d = *in++;
            if ((d & 0xC0) != 0x80) return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = (unsigned short) c;
        }
        else if (c < 0x110000) {
            /* A surrogate pair needs two free slots. */
            if (outend - out < 2) return -1;
            c -= 0x10000;
            *out++ = (unsigned short) (0xD800 | (c >> 10));
            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
        }
        else return -2;    /* beyond what UTF-16 can represent */
    }
    return (int) (out - outstart);
}
205
Daniel Veillard97b58771998-10-20 06:14:16 +0000206