/* Copyright 2008,2009 Alain Knaff.
 * This file is part of mtools.
 *
 * Mtools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Mtools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Mtools. If not, see <http://www.gnu.org/licenses/>.
 *
 * Various character set conversions used by mtools
 */
19#include "sysincludes.h"
20#include "msdos.h"
21#include "mtools.h"
22
23#include <stdio.h>
24#include <errno.h>
25#include <stdlib.h>
26#include "file_name.h"
27
28
29#ifdef HAVE_ICONV_H
30#include <iconv.h>
31
/* Conversion state for one DOS codepage when iconv is available:
 * a pair of iconv descriptors, both opened by cp_open() and released
 * by cp_close(). */
struct doscp_t {
	iconv_t from;	/* DOS codepage -> wchar_t encoding */
	iconv_t to;	/* wchar_t encoding -> DOS codepage */
};
36
/* iconv name of the encoding that matches wchar_t; NULL until
 * getWcharCp() has successfully probed one. */
static const char *wcharCp=NULL;

/* Candidate encoding names for wchar_t, probed in this order by
 * getWcharCp(). */
static const char* wcharTries[] = {
	"WCHAR_T",
	"UTF-32BE", "UTF-32LE",
	"UTF-16BE", "UTF-16LE",
	"UTF-32", "UTF-16",
	"UCS-4BE", "UCS-4LE",
	"UCS-2BE", "UCS-2LE",
	"UCS-4", "UCS-2"
};

/* Target encodings that try() attempts to open, in order, when probing
 * a candidate; the first one iconv_open() accepts is used. */
static const char *asciiTries[] = {
	"ASCII", "ASCII-GR", "ISO8859-1"
};

/* Wide-character probe string that try() converts and compares
 * against the bytes "ab". */
static const wchar_t *testString = L"ab";
54
55static int try(const char *testCp) {
56 size_t res;
57 char *inbuf = (char *)testString;
58 size_t inbufLen = 2*sizeof(wchar_t);
59 char outbuf[3];
60 char *outbufP = outbuf;
61 size_t outbufLen = 2*sizeof(char);
62 iconv_t test = 0;
63 size_t i;
Yi Kong39bbd962022-01-09 19:41:38 +080064
Alistair Delvabeaee832021-02-24 11:27:23 -080065 for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) {
66 test = iconv_open(asciiTries[i], testCp);
67 if(test != (iconv_t) -1)
68 break;
69 }
70 if(test == (iconv_t) -1)
71 goto fail0;
72 res = iconv(test,
73 &inbuf, &inbufLen,
74 &outbufP, &outbufLen);
75 if(res != 0 || outbufLen != 0 || inbufLen != 0)
76 goto fail;
77 if(memcmp(outbuf, "ab", 2))
78 goto fail;
79 /* fprintf(stderr, "%s ok\n", testCp); */
80 return 1;
81 fail:
82 iconv_close(test);
83 fail0:
84 /*fprintf(stderr, "%s fail\n", testCp);*/
85 return 0;
86}
87
88static const char *getWcharCp(void) {
89 unsigned int i;
90 if(wcharCp != NULL)
Yi Kong39bbd962022-01-09 19:41:38 +080091 return wcharCp;
Alistair Delvabeaee832021-02-24 11:27:23 -080092 for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) {
93 if(try(wcharTries[i]))
94 return (wcharCp=wcharTries[i]);
95 }
96 fprintf(stderr, "No codepage found for wchar_t\n");
97 return NULL;
98}
99
100
Yi Kong39bbd962022-01-09 19:41:38 +0800101doscp_t *cp_open(unsigned int codepage)
Alistair Delvabeaee832021-02-24 11:27:23 -0800102{
103 char dosCp[17];
104 doscp_t *ret;
Alistair Delva8d7cf5d2021-04-13 07:59:20 -0700105 iconv_t from;
106 iconv_t to;
Alistair Delvabeaee832021-02-24 11:27:23 -0800107
108 if(codepage == 0)
109 codepage = mtools_default_codepage;
Yi Kong39bbd962022-01-09 19:41:38 +0800110 if(codepage > 9999) {
Alistair Delvabeaee832021-02-24 11:27:23 -0800111 fprintf(stderr, "Bad codepage %d\n", codepage);
112 return NULL;
113 }
114
115 if(getWcharCp() == NULL)
116 return NULL;
117
118 sprintf(dosCp, "CP%d", codepage);
119 from = iconv_open(wcharCp, dosCp);
120 if(from == (iconv_t)-1) {
121 fprintf(stderr, "Error converting to codepage %d %s\n",
122 codepage, strerror(errno));
123 return NULL;
124 }
125
126 sprintf(dosCp, "CP%d//TRANSLIT", codepage);
127 to = iconv_open(dosCp, wcharCp);
128 if(to == (iconv_t)-1) {
129 /* Transliteration not supported? */
130 sprintf(dosCp, "CP%d", codepage);
131 to = iconv_open(dosCp, wcharCp);
132 }
133 if(to == (iconv_t)-1) {
134 iconv_close(from);
135 fprintf(stderr, "Error converting to codepage %d %s\n",
136 codepage, strerror(errno));
137 return NULL;
138 }
139
140 ret = New(doscp_t);
141 if(ret == NULL)
142 return ret;
143 ret->from = from;
144 ret->to = to;
145 return ret;
146}
147
148void cp_close(doscp_t *cp)
149{
150 iconv_close(cp->to);
151 iconv_close(cp->from);
152 free(cp);
153}
154
Yi Kong39bbd962022-01-09 19:41:38 +0800155size_t dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len)
Alistair Delvabeaee832021-02-24 11:27:23 -0800156{
Yi Kong39bbd962022-01-09 19:41:38 +0800157 size_t r;
Alistair Delvabeaee832021-02-24 11:27:23 -0800158 size_t in_len=len;
159 size_t out_len=len*sizeof(wchar_t);
160 wchar_t *dptr=wchar;
Yi Kong39bbd962022-01-09 19:41:38 +0800161 char *dos2 = (char *) dos; /* Magic to be able to call iconv with its
Alistair Delvabeaee832021-02-24 11:27:23 -0800162 buggy prototype */
163 r=iconv(cp->from, &dos2, &in_len, (char **)&dptr, &out_len);
Yi Kong39bbd962022-01-09 19:41:38 +0800164 if(r == (size_t) -1)
Alistair Delvabeaee832021-02-24 11:27:23 -0800165 return r;
166 *dptr = L'\0';
Yi Kong39bbd962022-01-09 19:41:38 +0800167 return (size_t) (dptr-wchar);
Alistair Delvabeaee832021-02-24 11:27:23 -0800168}
169
/**
 * Converts in_len wide characters to at most out_len bytes in dest,
 * replacing characters the target encoding cannot represent.
 *
 * conv:    iconv descriptor converting from the wchar_t encoding
 * wchar:   source wide characters (in_len of them)
 * dest:    output buffer with room for at least out_len bytes; no
 *          terminator is written
 * mangled: bit 0 is OR-ed in whenever a character had to be replaced;
 *          must point to an initialised int
 *
 * When iconv stops with EILSEQ, the offending wide character is skipped
 * and '_' is emitted in its place, then conversion resumes. Any other
 * iconv error, or a successful conversion of the whole input, ends the
 * loop. Finally every '?' in the produced output (as emitted for
 * untransliterable characters) is turned into '_' as well.
 *
 * Returns the number of bytes stored in dest.
 */
static size_t safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
			 size_t in_len, size_t out_len, int *mangled)
{
	size_t r;
	size_t i;
	/* iconv's prototype takes a non-const char** input cursor */
	char *src = (char *) wchar;
	char *dptr = dest;
	size_t len;

	in_len=in_len*sizeof(wchar_t);

	while(in_len > 0 && out_len > 0) {
		r=iconv(conv, &src, &in_len, &dptr, &out_len);
		if(r != (size_t) -1 || errno != EILSEQ) {
			/* everything transformed, or error that is _not_ a bad
			 * character. errno is only meaningful when iconv
			 * actually failed, hence the check on r first */
			break;
		}
		*mangled |= 1;

		/* the bad character is still unconsumed: it needs one output
		 * byte and one whole wide character of input */
		if(out_len == 0 || in_len < sizeof(wchar_t))
			break;
		*dptr++ = '_';
		out_len--;

		src += sizeof(wchar_t);
		in_len -= sizeof(wchar_t);
	}

	len = (size_t) (dptr-dest); /* how many dest characters have there been
				       generated */

	/* eliminate question marks which might have been formed by
	   untransliterable characters */
	for(i=0; i<len; i++) {
		if(dest[i] == '?') {
			dest[i] = '_';
			*mangled |= 1;
		}
	}
	return len;
}
217
218void wchar_to_dos(doscp_t *cp,
219 wchar_t *wchar, char *dos, size_t len, int *mangled)
220{
221 safe_iconv(cp->to, wchar, dos, len, len, mangled);
222}
223
224#else
225
226#include "codepage.h"
227
/* Conversion state for one DOS codepage when iconv is not available:
 * two byte-indexed lookup tables filled in by cp_open(). */
struct doscp_t {
	/* points at the tounix table of the matching codepages[] entry;
	 * indexed with (DOS byte & 0x7f) */
	unsigned char *from_dos;
	/* reverse table built by cp_open(): indexed with
	 * (native byte & 0x7f), holds the DOS byte (0x80 | index into
	 * from_dos); entries that were never assigned are expected to be 0 */
	unsigned char to_dos[0x80];
};
232
Yi Kong39bbd962022-01-09 19:41:38 +0800233doscp_t *cp_open(unsigned int codepage)
Alistair Delvabeaee832021-02-24 11:27:23 -0800234{
235 doscp_t *ret;
236 int i;
237 Codepage_t *cp;
238
239 if(codepage == 0)
240 codepage = 850;
241
242 ret = New(doscp_t);
243 if(ret == NULL)
244 return ret;
245
246 for(cp=codepages; cp->nr ; cp++)
247 if(cp->nr == codepage) {
248 ret->from_dos = cp->tounix;
249 break;
250 }
251
252 if(ret->from_dos == NULL) {
253 fprintf(stderr, "Bad codepage %d\n", codepage);
254 free(ret);
255 return NULL;
256 }
257
258 for(i=0; i<0x80; i++) {
259 char native = ret->from_dos[i];
260 if(! (native & 0x80))
261 continue;
262 ret->to_dos[native & 0x7f] = 0x80 | i;
263 }
264 return ret;
265}
266
/**
 * Releases a conversion handle obtained from cp_open(). In this
 * table-based variant the structure is the only thing freed; from_dos is
 * a pointer that cp_open() copied from a codepages[] entry and is not
 * released here.
 */
void cp_close(doscp_t *cp)
{
	free(cp);
}
271
Yi Kong39bbd962022-01-09 19:41:38 +0800272size_t dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len)
Alistair Delvabeaee832021-02-24 11:27:23 -0800273{
274 int i;
275
276 for(i=0; i<len && dos[i]; i++) {
277 char c = dos[i];
278 if(c >= ' ' && c <= '~')
279 wchar[i] = c;
280 else {
281 wchar[i] = cp->from_dos[c & 0x7f];
282 }
283 }
284 wchar[i] = '\0';
285 return i;
286}
287
288
289void wchar_to_dos(doscp_t *cp,
290 wchar_t *wchar, char *dos, size_t len, int *mangled)
291{
292 int i;
293 for(i=0; i<len && wchar[i]; i++) {
294 char c = wchar[i];
295 if(c >= ' ' && c <= '~')
296 dos[i] = c;
297 else {
298 dos[i] = cp->to_dos[c & 0x7f];
299 if(dos[i] == '\0') {
300 dos[i]='_';
301 *mangled=1;
302 }
303 }
304 }
305}
306
307#endif
308
309
310#ifndef HAVE_WCHAR_H
311
/* Minimal single-byte stand-ins used when <wchar.h> is not available.
 * The conversion state is a dummy and is never read or written. */
typedef int mbstate_t;

/**
 * Stores wc, narrowed to one byte, in *s. Always returns 1.
 */
static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
{
	(void) ps;
	*s = (char) wc;
	return 1;
}

/**
 * Stores the byte at *s in *pwc as a value in 0..255 (no sign extension).
 * Returns 0 when that byte is the NUL terminator and 1 otherwise, so that
 * callers which stop on a 0 return see the end of the string.
 * n and ps are ignored.
 */
static inline size_t mbrtowc(wchar_t *pwc, const char *s,
			     size_t n, mbstate_t *ps)
{
	(void) n;
	(void) ps;
	*pwc = (wchar_t) (unsigned char) *s;
	return *s != '\0';
}
326
327#endif
328
329#ifdef HAVE_ICONV_H
330
331#include <langinfo.h>
332
333static iconv_t to_native = NULL;
334
335static void initialize_to_native(void)
336{
337 char *li, *cp;
Yi Kong39bbd962022-01-09 19:41:38 +0800338 size_t len;
Alistair Delvabeaee832021-02-24 11:27:23 -0800339 if(to_native != NULL)
340 return;
341 li = nl_langinfo(CODESET);
342 len = strlen(li) + 11;
343 if(getWcharCp() == NULL)
344 exit(1);
345 cp = safe_malloc(len);
346 strcpy(cp, li);
347 strcat(cp, "//TRANSLIT");
348 to_native = iconv_open(cp, wcharCp);
349 if(to_native == (iconv_t) -1)
350 to_native = iconv_open(li, wcharCp);
351 if(to_native == (iconv_t) -1)
352 fprintf(stderr, "Could not allocate iconv for %s\n", cp);
353 free(cp);
354 if(to_native == (iconv_t) -1)
355 exit(1);
356}
357
358
359
360#endif
361
362
/**
 * Convert wchar string to native, converting at most len wchar characters
 * (conversion also stops at the first 0 wide character).
 *
 * native receives the converted bytes followed by a terminating NUL.
 * With iconv, at most out_len bytes are produced before the terminator,
 * so native needs room for out_len + 1 bytes. Without iconv, out_len is
 * not consulted and the caller must size native for the full result.
 *
 * Characters that cannot be converted are replaced by '_'.
 *
 * Returns number of generated native characters (terminator excluded).
 */
size_t wchar_to_native(const wchar_t *wchar, char *native, size_t len,
		       size_t out_len)
{
#ifdef HAVE_ICONV_H
	/* safe_iconv() ORs flags into this, so it must start out defined */
	int mangled = 0;
	size_t r;
	initialize_to_native();
	len = wcsnlen(wchar,len);
	r=safe_iconv(to_native, wchar, native, len, out_len, &mangled);
	native[r]='\0';
	return r;
#else
	size_t i;
	char *dptr = native;
	mbstate_t ps;
	(void) out_len;
	memset(&ps, 0, sizeof(ps));
	for(i=0; i<len && wchar[i] != 0; i++) {
		size_t r = wcrtomb(dptr, wchar[i], &ps);
		if(r == (size_t) -1) {
			/* unconvertible character: substitute it, and start
			 * over from a clean conversion state. Never add
			 * (size_t) -1 to dptr, whatever errno says. */
			r=1;
			*dptr='_';
			memset(&ps, 0, sizeof(ps));
		}
		dptr+=r;
	}
	*dptr='\0';
	return (size_t) (dptr-native);
#endif
}
395
/**
 * Convert native string to wchar string, generating at most len wchar
 * characters. If end is supplied, stop conversion when source pointer
 * reaches end; otherwise stop at the NUL terminator of native.
 *
 * wchar receives the converted characters followed by a terminating 0,
 * so it must have room for len + 1 elements.
 *
 * Bytes that do not form a valid (or complete) multibyte character are
 * taken as Latin1 if in the range 0xa0..0xfe and replaced by '_'
 * otherwise; one byte is consumed in either case.
 *
 * mangled may be NULL. If not, bits 0 and 1 are OR-ed in when input was
 * left unconverted: with end, when the source pointer stopped short of
 * end; without end, when len characters were produced and the source is
 * not yet at its terminator.
 *
 * Returns number of generated wchars (terminator excluded).
 */
size_t native_to_wchar(const char *native, wchar_t *wchar, size_t len,
		       const char *end, int *mangled)
{
	mbstate_t ps;
	size_t i;
	memset(&ps, 0, sizeof(ps));

	for(i=0; i<len && (!end || native < end); i++) {
		/* Byte budget for this one character: never look beyond
		 * end; without end the NUL terminator bounds the scan, so
		 * the longest possible character is enough. (This is a byte
		 * count and unrelated to len, which counts wchars.) */
		size_t avail = end ? (size_t) (end - native) : MB_CUR_MAX;
		size_t r = mbrtowc(wchar+i, native, avail, &ps);
		if(r == (size_t) -1 || r == (size_t) -2) {
			/* Unconvertible or truncated character. Just pretend
			   it's Latin1 encoded (if valid Latin1 character) or
			   substitute with an underscore if not
			*/
			char c = *native;
			if(c >= '\xa0' && c < '\xff')
				wchar[i] = c & 0xff;
			else
				wchar[i] = '_';
			memset(&ps, 0, sizeof(ps));
			r=1;
		}
		if(r == 0)
			break;
		native += r;
	}
	if(mangled && ((end && native < end) || (!end && *native && i == len)))
		*mangled |= 3;
	wchar[i]='\0';
	return i;
}
432