blob: ce86fff70f7583ff5fe56568f4943028b666ef32 [file] [log] [blame]
Daniel Veillard891e4041998-10-19 00:43:02 +00001/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * [ISO-10646] UTF-8 and UTF-16 in Annexes
7 * [ISO-8859-1] ISO Latin-1 characters codes.
8 * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9 * Worldwide Character Encoding -- Version 1.0", Addison-
10 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11 * described in Unicode Technical Report #4.
12 * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13 * Information Interchange, ANSI X3.4-1986.
14 *
15 * Original code from "Martin J. Duerst" <duerst@w3.org>
16 *
17 * See Copyright for the status of this software.
18 *
19 * $Id$
20 *
21 * Daniel.Veillard@w3.org
22 */
23
24#include "encoding.h"
25
Daniel Veillard0ba4d531998-11-01 19:34:31 +000026/*
27 * From rfc2044: encoding of the Unicode values on UTF-8:
28 *
29 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
30 * 0000 0000-0000 007F 0xxxxxxx
31 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
32 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
33 *
34 * I hope we won't use values > 0xFFFF anytime soon !
35 */
36
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged; every other
 * byte expands to a two-byte UTF-8 sequence. The room needed by a
 * character is checked before any of its bytes is stored, so @out never
 * ends with a truncated sequence when -1 is returned.
 *
 * return values: number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else {
            /* Both bytes of the sequence must fit, or none is written. */
            if (outend - out < 2) return -1;
            *out++ = 0xC0 | (c >> 6);
            *out++ = 0x80 | (0x3F & c);
        }
    }
    return (int)(out - outstart);
}
71
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only single-byte characters and two-byte sequences
 * led by 0xC2 or 0xC3 (i.e. code points up to 0xFF) can be represented;
 * anything else, including a truncated or malformed two-byte sequence,
 * makes the conversion fail.
 * TODO: need a fallback mechanism ...
 *
 * return values: the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed.
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned char c;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else if (((c & 0xFE) == 0xC2) && (in < inend) &&
                 ((*in & 0xC0) == 0x80)) {
            /* The lead byte is followed by a genuine 10xxxxxx byte. */
            if (out >= outend) return -1;
            *out++ = ((c & 0x03) << 6) | (*in++ & 0x3F);
        }
        else return -2;
    }
    return (int)(out - outstart);
}
107
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in (number of shorts)
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. A high surrogate (0xD800-0xDBFF) must be followed
 * by a low surrogate (0xDC00-0xDFFF); the pair is combined into a single
 * value above 0xFFFF and emitted as a four-byte sequence. A low surrogate
 * that is not preceded by a high one is not rejected: it is encoded as a
 * three-byte sequence like any other 16-bit value.
 *
 * return values: number of bytes written, or -1 by lack of space or when
 *     a high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned short* inend = in + inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c = *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in < inend) && (((d = *in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Emit the lead byte; bits is the shift of the first continuation
         * byte still to be written (-6 means there is none).
         */
        if (out >= outend) return -1;
        if      (c <    0x80) { *out++ = c;                bits = -6; }
        else if (c <   0x800) { *out++ = (c >>  6) | 0xC0; bits =  0; }
        else if (c < 0x10000) { *out++ = (c >> 12) | 0xE0; bits =  6; }
        else                  { *out++ = (c >> 18) | 0xF0; bits = 12; }

        /* Continuation bytes: six payload bits each, tagged 10xxxxxx. */
        for ( ; bits >= 0; bits -= 6) {
            if (out >= outend) return -1;
            *out++ = ((c >> bits) & 0x3F) | 0x80;
        }
    }
    return (int)(out - outstart);
}
155
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Values below 0x10000 are stored as one short;
 * values up to 0x10FFFF are stored as a surrogate pair. A stray
 * continuation byte, a lead byte of 0xF8 or above, a truncated or
 * malformed multi-byte sequence, and a value above 0x10FFFF are all
 * transcoding failures.
 * TODO: need a fallback mechanism ...
 *
 * return values: the number of shorts written, or -1 by lack of space,
 *     or -2 if the transcoding failed.
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart = out;
    unsigned short* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d = *in++;
        if      (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) return -2;    /* trailing byte in leading position */
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* Missing or malformed continuation byte: bad input, not a
             * full output buffer, hence -2 rather than -1. */
            if ((in >= inend) || (((d = *in++) & 0xC0) != 0x80)) return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else if (c < 0x110000) {
            /* A surrogate pair needs two free slots. */
            if (outend - out < 2) return -1;
            c -= 0x10000;
            *out++ = 0xD800 | (c >> 10);
            *out++ = 0xDC00 | (c & 0x03FF);
        }
        else return -2;    /* beyond the range UTF-16 can represent */
    }
    return (int)(out - outstart);
}
207
Daniel Veillard97b58771998-10-20 06:14:16 +0000208