/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code from "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * $Id$
 *
 * Daniel.Veillard@w3.org
 */
23
24#include "encoding.h"
25
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged; every other
 * byte is expanded to a two-byte UTF-8 sequence.
 *
 * return values: number of bytes written, or -1 by lack of space.
 * When -1 is returned, @out only contains complete UTF-8 sequences: a
 * two-byte sequence is never written partially.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else {
            /*
             * Check the room for both bytes before writing anything, so
             * that a lone lead byte is never left at the end of @out.
             */
            if (outend - out < 2) return -1;
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (0x3F & c));
        }
    }
    return (int) (out-outstart);
}
60
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only single-byte sequences and two-byte sequences
 * whose lead byte is 0xC2 or 0xC3 (i.e. values 0x80..0xFF) are accepted.
 * TODO: need a fallback mechanism ...
 *
 * return values: the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed (value not representable in ISO Latin 1,
 *     truncated sequence, or invalid continuation byte).
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c, d;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++= c;
        }
        else if ((c & 0xFE) == 0xC2) {
            /* lead byte of a two-byte sequence: one more byte is required */
            if (in >= inend) return -2;
            if (out >= outend) return -1;
            d= *in++;
            /* the second byte must be a continuation byte (10xxxxxx) */
            if ((d & 0xC0) != 0x80) return -2;
            *out++= (unsigned char) (((c & 0x03) << 6) | (d & 0x3F));
        }
        else return -2;
    }
    return (int) (out-outstart);
}
96
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in (number of unsigned shorts)
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. A high surrogate followed by a low surrogate is
 * combined into a single value and emitted as a four-byte sequence.
 *
 * return values: number of bytes written, or -1 by lack of space. -1 is
 *     also returned when a high surrogate is not followed by a low
 *     surrogate. A UTF-8 sequence is never written partially.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned short* inend= in+inlen;
    unsigned int c, d, lead;
    int bits;

    while (in < inend) {
        c= *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in<inend) && (((d=*in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * lead is the marker of the first byte; bits is the shift of the
         * first trailing byte (-6 meaning that there is no trailing byte).
         */
        if      (c <    0x80) { lead= 0x00; bits= -6; }
        else if (c <   0x800) { lead= 0xC0; bits=  0; }
        else if (c < 0x10000) { lead= 0xE0; bits=  6; }
        else                  { lead= 0xF0; bits= 12; }

        /* the sequence takes bits / 6 + 2 bytes: check the room up front */
        if (outend - out < bits / 6 + 2) return -1;

        *out++= (unsigned char) ((c >> (bits + 6)) | lead);
        for ( ; bits >= 0; bits-= 6) {
            /* trailing bytes carry 6 bits each and are tagged 10xxxxxx */
            *out++= (unsigned char) (((c >> bits) & 0x3F) | 0x80);
        }
    }
    return (int) (out-outstart);
}
144
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Values of 0x10000 and above are emitted as a
 * surrogate pair (two shorts).
 * TODO: need a fallback mechanism ...
 *
 * return values: the number of shorts written, or -1 by lack of space, or
 *     -2 if the transcoding failed (invalid lead byte, truncated sequence,
 *     invalid continuation byte, or value above 0x10FFFF).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart= out;
    unsigned short* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* a missing or malformed trailing byte is a transcoding failure */
            if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = (unsigned short) c;
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs room for two shorts */
            if (outend - out < 2) return -1;
            c -= 0x10000;
            *out++ = (unsigned short) (0xD800 | (c >> 10));
            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
        }
        else return -2;    /* beyond the range reachable by UTF-16 */
    }
    return (int) (out-outstart);
}
196
Daniel Veillard97b58771998-10-20 06:14:16 +0000197