blob: 033e45e27c410b1e2f2606992f598ded96efddf4 [file] [log] [blame]
/*
 * encoding.c : implements the encoding conversion functions needed for XML
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 character codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Original code from "Martin J. Duerst" <duerst@w3.org>
 *
 * See Copyright for the status of this software.
 *
 * $Id$
 *
 * Daniel.Veillard@w3.org
 */
23
24#include "encoding.h"
25
/*
 * isolat1ToUTF8:
 * @out:     buffer receiving the UTF-8 bytes
 * @outlen:  capacity of @out, in bytes
 * @in:      ISO Latin 1 input bytes
 * @inlen:   number of bytes available in @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.  Bytes below 0x80 are copied unchanged; every other
 * byte expands to a two-byte UTF-8 sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space.
 */
int isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++= c;
        }
        else {
            /*
             * Reserve room for the whole sequence before writing any of
             * it, so a failure never leaves a lone lead byte in the output.
             */
            if (outend - out < 2) return -1;
            *out++= (unsigned char) (0xC0 | (c >> 6));
            *out++= (unsigned char) (0x80 | (0x3F & c));
        }
    }
    return (int) (out - outstart);
}
54
55
/*
 * UTF8Toisolat1:
 * @out:     buffer receiving the ISO Latin 1 bytes
 * @outlen:  capacity of @out, in bytes
 * @in:      UTF-8 input bytes
 * @inlen:   number of bytes available in @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.  Only ASCII bytes and two-byte sequences whose lead
 * byte is 0xC2 or 0xC3 (code points 0x80..0xFF) can be represented.
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 * if the transcoding failed (character outside ISO Latin 1, truncated
 * sequence, or invalid continuation byte).
 *
 * TODO: need a fallback mechanism ...
 */
int UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned char c, d;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++= c;
        }
        else if ((c & 0xFE) == 0xC2) {
            /* lead byte without its continuation byte */
            if (in >= inend) return -2;
            if (out >= outend) return -1;
            d= *in++;
            /* a continuation byte must look like 10xxxxxx */
            if ((d & 0xC0) != 0x80) return -2;
            *out++= (unsigned char) (((c & 0x03) << 6) | (d & 0x3F));
        }
        else return -2;
    }
    return (int) (out - outstart);
}
86
/*
 * UTF16ToUTF8:
 * @out:     buffer receiving the UTF-8 bytes
 * @outlen:  capacity of @out, in bytes
 * @in:      UTF-16 input code units
 * @inlen:   number of code units (not bytes) available in @in
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out.  A high surrogate must be followed by a low
 * surrogate; the pair is combined into one code point above 0xFFFF.
 * A low surrogate appearing on its own is not rejected and is encoded
 * as a three-byte sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space.  -1 is
 * also returned when a high surrogate is not followed by a low surrogate.
 */
int UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned short* inend= in+inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c= *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in<inend) && (((d=*in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Emit the lead byte; bits is the shift of the first continuation
         * byte still to be written (negative when there is none).
         */
        if (out >= outend) return -1;
        if      (c <    0x80) { *out++= (unsigned char) c;                 bits= -6; }
        else if (c <   0x800) { *out++= (unsigned char) ((c >>  6) | 0xC0); bits=  0; }
        else if (c < 0x10000) { *out++= (unsigned char) ((c >> 12) | 0xE0); bits=  6; }
        else                  { *out++= (unsigned char) ((c >> 18) | 0xF0); bits= 12; }

        /* continuation bytes: 10xxxxxx, most significant group first */
        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend) return -1;
            *out++= (unsigned char) (((c >> bits) & 0x3F) | 0x80);
        }
    }
    return (int) (out - outstart);
}
128
/*
 * UTF8ToUTF16:
 * @out:     buffer receiving the UTF-16 code units
 * @outlen:  capacity of @out, in code units (not bytes)
 * @in:      UTF-8 input bytes
 * @inlen:   number of bytes available in @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out.  Code points above 0xFFFF are written as a
 * surrogate pair.
 *
 * Returns the number of code units written, or -1 by lack of space, or -2
 * if the transcoding failed (invalid lead byte, truncated sequence,
 * invalid continuation byte, or code point beyond 0x10FFFF).
 *
 * NOTE(review): overlong encodings and UTF-8 encoded surrogates are not
 * rejected here.
 *
 * TODO: need a fallback mechanism ...
 */
int UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart= out;
    unsigned short* outend= out+outlen;
    unsigned char* inend= in+inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* truncated or malformed sequence: bad input, not lack of space */
            if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = (unsigned short) c;
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs two free code units */
            if (outend - out < 2) return -1;
            c -= 0x10000;
            *out++ = (unsigned short) (0xD800 | (c >> 10));
            *out++ = (unsigned short) (0xDC00 | (c & 0x03FF));
        }
        else return -2;    /* beyond the range UTF-16 can represent */
    }
    return (int) (out - outstart);
}
175