/*
 * Copyright (C) 2001 Peter Harris <peter.harris@hummingbird.com>
 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
19
/*
 * Convert a string between UTF-8 and the locale's charset.
 */
23
24#include <stdlib.h>
25#include <string.h>
26
27#include "utf8.h"
Josh Coalsonf8e6b092002-08-14 00:51:30 +000028#include "charset.h"
Josh Coalsonfda98fb2002-05-17 06:33:39 +000029
30
31#ifdef _WIN32
32
/* Thanks to Peter Harris <peter.harris@hummingbird.com> for this win32
 * code.
 */
36
37#include <stdio.h>
38#include <windows.h>
39
/*
 * Read one code point from a NUL-terminated UTF-16 string, advancing *index
 * past the code unit(s) consumed.
 *
 * A high surrogate directly followed by a low surrogate is combined into a
 * single supplementary-plane code point.  An unpaired surrogate yields
 * U+FFFD so that the caller never emits an invalid UTF-8 sequence.
 * Returns 0 when the terminator is reached.
 */
static unsigned long next_utf16_code_point(const wchar_t *unicode, size_t *index)
{
	unsigned long unit = (unsigned short)unicode[(*index)++];

	if(unit >= 0xd800 && unit <= 0xdbff) {
		/* Peek only: if this is the terminator it must stay unread. */
		unsigned long low = (unsigned short)unicode[*index];

		if(low >= 0xdc00 && low <= 0xdfff) {
			(*index)++;
			return 0x10000 + ((unit - 0xd800) << 10) + (low - 0xdc00);
		}
		return 0xfffd;
	}
	if(unit >= 0xdc00 && unit <= 0xdfff)
		return 0xfffd;

	return unit;
}

/*
 * Convert a NUL-terminated UTF-16 string into a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * Code points below U+0080 take one byte, below U+0800 two bytes, below
 * U+10000 three bytes, and surrogate pairs (U+10000 and above) four bytes.
 *
 * Returns NULL if memory cannot be allocated; otherwise the caller owns the
 * returned buffer and must free() it.
 */
static unsigned char *make_utf8_string(const wchar_t *unicode)
{
	size_t size = 0, index = 0, out_index = 0;
	unsigned char *out;
	unsigned long c;

	/* first calculate the size of the target string */
	while((c = next_utf16_code_point(unicode, &index)) != 0) {
		if(c < 0x80)
			size += 1;
		else if(c < 0x800)
			size += 2;
		else if(c < 0x10000)
			size += 3;
		else
			size += 4;
	}

	out = malloc(size + 1);
	if(out == NULL)
		return NULL;

	index = 0;
	while((c = next_utf16_code_point(unicode, &index)) != 0) {
		if(c < 0x80) {
			out[out_index++] = (unsigned char)c;
		}
		else if(c < 0x800) {
			out[out_index++] = (unsigned char)(0xc0 | (c >> 6));
			out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
		}
		else if(c < 0x10000) {
			out[out_index++] = (unsigned char)(0xe0 | (c >> 12));
			out[out_index++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
			out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
		}
		else {
			out[out_index++] = (unsigned char)(0xf0 | (c >> 18));
			out[out_index++] = (unsigned char)(0x80 | ((c >> 12) & 0x3f));
			out[out_index++] = (unsigned char)(0x80 | ((c >> 6) & 0x3f));
			out[out_index++] = (unsigned char)(0x80 | (c & 0x3f));
		}
	}
	out[out_index] = 0x00;

	return out;
}
83
/*
 * Decode the UTF-8 sequence starting at s (s[0] must not be the terminator).
 *
 * Stores the decoded code point in *code_point and returns the number of
 * bytes consumed (always at least 1).  Anything malformed -- a stray
 * continuation byte, an invalid lead byte, a sequence cut short, an overlong
 * form, an encoded surrogate or a value above U+10FFFF -- decodes to U+FFFD.
 * A sequence that is cut short consumes only the bytes that were valid, so
 * the string terminator is never skipped.
 */
static size_t decode_utf8_sequence(const unsigned char *s, unsigned long *code_point)
{
	/* smallest code point that legitimately needs a sequence of each length */
	static const unsigned long min_value[] = { 0, 0, 0x80, 0x800, 0x10000 };
	unsigned long cp;
	size_t len, i;

	if(s[0] < 0x80) {
		*code_point = s[0];
		return 1;
	}

	if((s[0] & 0xe0) == 0xc0) {
		len = 2;
		cp = s[0] & 0x1f;
	}
	else if((s[0] & 0xf0) == 0xe0) {
		len = 3;
		cp = s[0] & 0x0f;
	}
	else if((s[0] & 0xf8) == 0xf0) {
		len = 4;
		cp = s[0] & 0x07;
	}
	else {
		*code_point = 0xfffd;
		return 1;
	}

	for(i = 1; i < len; i++) {
		/* A NUL terminator fails this test too, which stops the scan. */
		if((s[i] & 0xc0) != 0x80) {
			*code_point = 0xfffd;
			return i;
		}
		cp = (cp << 6) | (s[i] & 0x3f);
	}

	if(cp < min_value[len] || cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff))
		cp = 0xfffd;

	*code_point = cp;
	return len;
}

/*
 * Convert a NUL-terminated UTF-8 string into a newly allocated,
 * NUL-terminated string of UTF-16 code units.
 *
 * Code points up to U+FFFF occupy one element; larger ones are written as a
 * high/low surrogate pair.  Malformed input is replaced by U+FFFD.
 *
 * Returns NULL if memory cannot be allocated; otherwise the caller owns the
 * returned buffer and must free() it.
 */
static wchar_t *make_unicode_string(const unsigned char *utf8)
{
	size_t size = 0, out_index = 0;
	const unsigned char *p;
	unsigned long cp;
	wchar_t *out;

	/* first calculate the size of the target string */
	p = utf8;
	while(*p != 0) {
		p += decode_utf8_sequence(p, &cp);
		size += (cp >= 0x10000) ? 2 : 1;
	}

	/* refuse sizes whose byte count would wrap around */
	if(size >= (size_t)-1 / sizeof *out)
		return NULL;

	out = malloc((size + 1) * sizeof *out);
	if(out == NULL)
		return NULL;

	p = utf8;
	while(*p != 0) {
		p += decode_utf8_sequence(p, &cp);
		if(cp >= 0x10000) {
			cp -= 0x10000;
			out[out_index++] = (wchar_t)(0xd800 + (cp >> 10));
			out[out_index++] = (wchar_t)(0xdc00 + (cp & 0x3ff));
		}
		else {
			out[out_index++] = (wchar_t)cp;
		}
	}
	out[out_index] = 0;

	return out;
}
131
132int utf8_encode(const char *from, char **to)
133{
134 wchar_t *unicode;
135 int wchars, err;
136
137 wchars = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
138 strlen(from), NULL, 0);
139
140 if(wchars == 0)
141 {
142 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
143 return -1;
144 }
145
146 unicode = calloc(wchars + 1, sizeof(unsigned short));
147 if(unicode == NULL)
148 {
149 fprintf(stderr, "Out of memory processing string to UTF8\n");
150 return -1;
151 }
152
153 err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
154 strlen(from), unicode, wchars);
155 if(err != wchars)
156 {
157 free(unicode);
158 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
159 return -1;
160 }
161
162 /* On NT-based windows systems, we could use WideCharToMultiByte(), but
163 * MS doesn't actually have a consistent API across win32.
164 */
165 *to = make_utf8_string(unicode);
166
167 free(unicode);
168 return 0;
169}
170
171int utf8_decode(const char *from, char **to)
172{
173 wchar_t *unicode;
174 int chars, err;
175
176 /* On NT-based windows systems, we could use MultiByteToWideChar(CP_UTF8), but
177 * MS doesn't actually have a consistent API across win32.
178 */
179 unicode = make_unicode_string(from);
180 if(unicode == NULL)
181 {
182 fprintf(stderr, "Out of memory processing string from UTF8 to UNICODE16\n");
183 return -1;
184 }
185
186 chars = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
187 -1, NULL, 0, NULL, NULL);
188
189 if(chars == 0)
190 {
191 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
192 free(unicode);
193 return -1;
194 }
195
196 *to = calloc(chars + 1, sizeof(unsigned char));
197 if(*to == NULL)
198 {
199 fprintf(stderr, "Out of memory processing string to local charset\n");
200 free(unicode);
201 return -1;
202 }
203
204 err = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
205 -1, *to, chars, NULL, NULL);
206 if(err != chars)
207 {
208 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
209 free(unicode);
210 free(*to);
211 *to = NULL;
212 return -1;
213 }
214
215 free(unicode);
216 return 0;
217}
218
219#else /* End win32. Rest is for real operating systems */
220
221
222#ifdef HAVE_LANGINFO_CODESET
223#include <langinfo.h>
224#endif
225
/*
 * Converter defined outside this file; convert_buffer() calls it only when
 * HAVE_ICONV is defined.
 * NOTE(review): the meaning of its return codes is not visible here --
 * convert_string() treats -1 and -2 specially; confirm against the
 * implementation.
 */
int iconvert(
	const char *fromcode,
	const char *tocode,
	const char *from,
	size_t fromlen,
	char **to,
	size_t *tolen
);
229
/* Heap-allocated name of the local charset; NULL means "US-ASCII". */
static char *current_charset = NULL;

/*
 * Select the charset used as the local side of utf8_encode()/utf8_decode().
 *
 * charset: name to use, or NULL to pick a default.  With NULL the name comes
 *          from nl_langinfo(CODESET) when HAVE_LANGINFO_CODESET is defined,
 *          and from the CHARSET environment variable if it is still NULL.
 *
 * The name is copied, so the caller's string need not outlive the call.  If
 * no name is found, the name is empty, or the copy cannot be allocated,
 * current_charset ends up NULL, i.e. "US-ASCII".
 */
void convert_set_charset(const char *charset)
{
	char *copy = NULL;

#ifdef HAVE_LANGINFO_CODESET
	if (!charset)
		charset = nl_langinfo(CODESET);
#endif

	if (!charset)
		charset = getenv("CHARSET");

	if (charset && *charset) {
		/* malloc + memcpy rather than strdup(): strdup() is not part of
		 * ISO C before C23 and is undeclared in strict -std=c99/c11 builds.
		 */
		size_t size = strlen(charset) + 1;

		copy = malloc(size);
		if (copy)
			memcpy(copy, charset, size);
	}

	/* Release the old name only after the new one has been copied. */
	free(current_charset);
	current_charset = copy;
}
248
/*
 * Hand a buffer to whichever conversion backend was selected at build time
 * and return that backend's result unchanged: iconvert() when HAVE_ICONV is
 * defined, charset_convert() otherwise.
 */
static int convert_buffer(const char *fromcode, const char *tocode,
		const char *from, size_t fromlen,
		char **to, size_t *tolen)
{
#ifdef HAVE_ICONV
	return iconvert(fromcode, tocode, from, fromlen, to, tolen);
#else /* should be ifdef USE_CHARSET_CONVERT */
	return charset_convert(fromcode, tocode, from, fromlen, to, tolen);
#endif
}
269
/*
 * Convert the NUL-terminated string `from` from charset `fromcode` to
 * charset `tocode`, storing a newly allocated result in *to.
 *
 * Result of convert_buffer()   Return value
 *   -2                         -1
 *   -1                         3, after the fallback below (-1 if malloc fails)
 *   anything else              that value, unchanged
 *
 * Fallback: *to receives a copy of `from` in which every byte outside the
 * 7-bit ASCII range has been overwritten with `replace`.
 */
static int convert_string(const char *fromcode, const char *tocode,
		const char *from, char **to, char replace)
{
	const size_t length = strlen(from);
	const int status = convert_buffer(fromcode, tocode, from, length, to, 0);
	char *copy;
	char *p;

	switch (status) {
	case -2:
		return -1;
	case -1:
		break;
	default:
		return status;
	}

	copy = malloc(length + 1);
	if (copy == NULL)
		return -1;
	memcpy(copy, from, length + 1);
	*to = copy;

	for (p = copy; *p != '\0'; p++) {
		if ((unsigned char)*p > 0x7f)
			*p = replace;
	}

	return 3;
}
294
295int utf8_encode(const char *from, char **to)
296{
297 char *charset;
298
299 if (!current_charset)
300 convert_set_charset(0);
301 charset = current_charset ? current_charset : "US-ASCII";
302 return convert_string(charset, "UTF-8", from, to, '#');
303}
304
305int utf8_decode(const char *from, char **to)
306{
307 char *charset;
308
309 if (!current_charset)
310 convert_set_charset(0);
311 charset = current_charset ? current_charset : "US-ASCII";
312 return convert_string("UTF-8", charset, from, to, '?');
313}
314
315#endif