blob: 085d8878bd5ddb083c8f50c01b000e8721b45cb8 [file] [log] [blame]
Alistair Delvabeaee832021-02-24 11:27:23 -08001/* Copyright 2008,2009 Alain Knaff.
2 * This file is part of mtools.
3 *
4 * Mtools is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * Mtools is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with Mtools. If not, see <http://www.gnu.org/licenses/>.
16 *
17 * Various character set conversions used by mtools
18 */
#include "sysincludes.h"
#include "msdos.h"
#include "mtools.h"

#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include "file_name.h"
27
28
29#ifdef HAVE_ICONV_H
30#include <iconv.h>
31
/* Pair of iconv descriptors for one DOS codepage (iconv build). */
struct doscp_t {
	iconv_t from;	/* opened by cp_open as iconv_open(wchar cp, DOS cp) */
	iconv_t to;	/* opened by cp_open as iconv_open(DOS cp, wchar cp) */
};
36
/* Cached iconv name for the wchar_t encoding; NULL until getWcharCp()
 * has found one that works. */
static const char *wcharCp = NULL;

/* Candidate iconv names for the wchar_t encoding, probed in this order. */
static const char *const wcharTries[] = {
	"WCHAR_T",
	"UTF-32BE",
	"UTF-32LE",
	"UTF-16BE",
	"UTF-16LE",
	"UTF-32",
	"UTF-16",
	"UCS-4BE",
	"UCS-4LE",
	"UCS-2BE",
	"UCS-2LE",
	"UCS-4",
	"UCS-2"
};

/* Candidate iconv names for the single-byte target of the probe. */
static const char *const asciiTries[] = {
	"ASCII",
	"ASCII-GR",
	"ISO8859-1"
};

/* Two-character wide string that the probe converts and compares. */
static const wchar_t *testString = L"ab";
54
55static int try(const char *testCp) {
56 size_t res;
57 char *inbuf = (char *)testString;
58 size_t inbufLen = 2*sizeof(wchar_t);
59 char outbuf[3];
60 char *outbufP = outbuf;
61 size_t outbufLen = 2*sizeof(char);
62 iconv_t test = 0;
63 size_t i;
64
65 for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) {
66 test = iconv_open(asciiTries[i], testCp);
67 if(test != (iconv_t) -1)
68 break;
69 }
70 if(test == (iconv_t) -1)
71 goto fail0;
72 res = iconv(test,
73 &inbuf, &inbufLen,
74 &outbufP, &outbufLen);
75 if(res != 0 || outbufLen != 0 || inbufLen != 0)
76 goto fail;
77 if(memcmp(outbuf, "ab", 2))
78 goto fail;
79 /* fprintf(stderr, "%s ok\n", testCp); */
80 return 1;
81 fail:
82 iconv_close(test);
83 fail0:
84 /*fprintf(stderr, "%s fail\n", testCp);*/
85 return 0;
86}
87
88static const char *getWcharCp(void) {
89 unsigned int i;
90 if(wcharCp != NULL)
91 return wcharCp;
92 for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) {
93 if(try(wcharTries[i]))
94 return (wcharCp=wcharTries[i]);
95 }
96 fprintf(stderr, "No codepage found for wchar_t\n");
97 return NULL;
98}
99
100
101doscp_t *cp_open(int codepage)
102{
103 char dosCp[17];
104 doscp_t *ret;
Alistair Delva8d7cf5d2021-04-13 07:59:20 -0700105 iconv_t from;
106 iconv_t to;
Alistair Delvabeaee832021-02-24 11:27:23 -0800107
108 if(codepage == 0)
109 codepage = mtools_default_codepage;
110 if(codepage < 0 || codepage > 9999) {
111 fprintf(stderr, "Bad codepage %d\n", codepage);
112 return NULL;
113 }
114
115 if(getWcharCp() == NULL)
116 return NULL;
117
118 sprintf(dosCp, "CP%d", codepage);
119 from = iconv_open(wcharCp, dosCp);
120 if(from == (iconv_t)-1) {
121 fprintf(stderr, "Error converting to codepage %d %s\n",
122 codepage, strerror(errno));
123 return NULL;
124 }
125
126 sprintf(dosCp, "CP%d//TRANSLIT", codepage);
127 to = iconv_open(dosCp, wcharCp);
128 if(to == (iconv_t)-1) {
129 /* Transliteration not supported? */
130 sprintf(dosCp, "CP%d", codepage);
131 to = iconv_open(dosCp, wcharCp);
132 }
133 if(to == (iconv_t)-1) {
134 iconv_close(from);
135 fprintf(stderr, "Error converting to codepage %d %s\n",
136 codepage, strerror(errno));
137 return NULL;
138 }
139
140 ret = New(doscp_t);
141 if(ret == NULL)
142 return ret;
143 ret->from = from;
144 ret->to = to;
145 return ret;
146}
147
148void cp_close(doscp_t *cp)
149{
150 iconv_close(cp->to);
151 iconv_close(cp->from);
152 free(cp);
153}
154
155int dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len)
156{
157 int r;
158 size_t in_len=len;
159 size_t out_len=len*sizeof(wchar_t);
160 wchar_t *dptr=wchar;
161 char *dos2 = (char *) dos; /* Magic to be able to call iconv with its
162 buggy prototype */
163 r=iconv(cp->from, &dos2, &in_len, (char **)&dptr, &out_len);
164 if(r < 0)
165 return r;
166 *dptr = L'\0';
167 return dptr-wchar;
168}
169
/**
 * Convert in_len wide characters from wchar into dest using conv,
 * writing at most out_len bytes. No terminator is written.
 *
 * Each input character that iconv rejects with EILSEQ is replaced by a
 * single '_' in the output. Any '?' present in the generated output is
 * also rewritten to '_'. In both cases bit 0 of *mangled is set.
 * Any other iconv error simply ends the conversion.
 *
 * Returns the number of bytes stored in dest.
 */
static int safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
		      size_t in_len, size_t out_len, int *mangled)
{
	size_t in_bytes = in_len * sizeof(wchar_t);
	char *out = dest;
	char *scan;

	while(in_bytes > 0 && out_len > 0) {
		int rc = iconv(conv, (char **)&wchar, &in_bytes,
			       &out, &out_len);

		/* Done: either everything was converted ... */
		if(rc >= 0)
			break;
		/* ... or the failure is not about a bad character */
		if(errno != EILSEQ)
			break;

		*mangled |= 1;

		if(out_len <= 0)
			break;

		/* Substitute the offending character and step over it */
		if(out)
			*out++ = '_';
		in_bytes -= sizeof(wchar_t);
		wchar++;
		out_len--;
	}

	/* Question marks may stem from characters that could not be
	 * transliterated; turn them into underscores as well */
	for(scan = dest; scan < out; scan++) {
		if(*scan == '?') {
			*scan = '_';
			*mangled |= 1;
		}
	}

	/* Number of destination bytes generated */
	return (int) (out - dest);
}
217
218void wchar_to_dos(doscp_t *cp,
219 wchar_t *wchar, char *dos, size_t len, int *mangled)
220{
221 safe_iconv(cp->to, wchar, dos, len, len, mangled);
222}
223
224#else
225
226#include "codepage.h"
227
/* Translation tables for one DOS codepage (build without iconv). */
struct doscp_t {
	/* Indexed by (DOS byte & 0x7f); points at the tounix table of the
	 * selected codepages[] entry */
	unsigned char *from_dos;
	/* Indexed by (native byte & 0x7f); filled in by cp_open, where 0
	 * means that no DOS byte maps to this native byte */
	unsigned char to_dos[0x80];
};
232
233doscp_t *cp_open(int codepage)
234{
235 doscp_t *ret;
236 int i;
237 Codepage_t *cp;
238
239 if(codepage == 0)
240 codepage = 850;
241
242 ret = New(doscp_t);
243 if(ret == NULL)
244 return ret;
245
246 for(cp=codepages; cp->nr ; cp++)
247 if(cp->nr == codepage) {
248 ret->from_dos = cp->tounix;
249 break;
250 }
251
252 if(ret->from_dos == NULL) {
253 fprintf(stderr, "Bad codepage %d\n", codepage);
254 free(ret);
255 return NULL;
256 }
257
258 for(i=0; i<0x80; i++) {
259 char native = ret->from_dos[i];
260 if(! (native & 0x80))
261 continue;
262 ret->to_dos[native & 0x7f] = 0x80 | i;
263 }
264 return ret;
265}
266
267void cp_close(doscp_t *cp)
268{
269 free(cp);
270}
271
272int dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len)
273{
274 int i;
275
276 for(i=0; i<len && dos[i]; i++) {
277 char c = dos[i];
278 if(c >= ' ' && c <= '~')
279 wchar[i] = c;
280 else {
281 wchar[i] = cp->from_dos[c & 0x7f];
282 }
283 }
284 wchar[i] = '\0';
285 return i;
286}
287
288
289void wchar_to_dos(doscp_t *cp,
290 wchar_t *wchar, char *dos, size_t len, int *mangled)
291{
292 int i;
293 for(i=0; i<len && wchar[i]; i++) {
294 char c = wchar[i];
295 if(c >= ' ' && c <= '~')
296 dos[i] = c;
297 else {
298 dos[i] = cp->to_dos[c & 0x7f];
299 if(dos[i] == '\0') {
300 dos[i]='_';
301 *mangled=1;
302 }
303 }
304 }
305}
306
307#endif
308
309
310#ifndef HAVE_WCHAR_H
311
/* Minimal single-byte stand-ins, used when <wchar.h> is unavailable.
 * There is no conversion state, so mbstate_t is a dummy. */
typedef int mbstate_t;

/**
 * Store wc as one byte in *s (the value is truncated to char).
 * Always returns 1; ps is unused.
 */
static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
{
	(void) ps;
	*s = (char) wc;
	return 1;
}

/**
 * Widen the single byte at s into *pwc, treating it as unsigned so that
 * bytes above 0x7f do not turn into negative wide characters.
 *
 * Returns 0 when the byte is NUL and 1 otherwise, as the standard
 * function does for single-byte input; callers depend on the 0 to
 * detect the end of a string. n and ps are unused.
 */
static inline size_t mbrtowc(wchar_t *pwc, const char *s,
			     size_t n, mbstate_t *ps)
{
	(void) n;
	(void) ps;
	*pwc = (unsigned char) *s;
	return *s != '\0';
}
326
327#endif
328
329#ifdef HAVE_ICONV_H
330
331#include <langinfo.h>
332
333static iconv_t to_native = NULL;
334
335static void initialize_to_native(void)
336{
337 char *li, *cp;
338 int len;
339 if(to_native != NULL)
340 return;
341 li = nl_langinfo(CODESET);
342 len = strlen(li) + 11;
343 if(getWcharCp() == NULL)
344 exit(1);
345 cp = safe_malloc(len);
346 strcpy(cp, li);
347 strcat(cp, "//TRANSLIT");
348 to_native = iconv_open(cp, wcharCp);
349 if(to_native == (iconv_t) -1)
350 to_native = iconv_open(li, wcharCp);
351 if(to_native == (iconv_t) -1)
352 fprintf(stderr, "Could not allocate iconv for %s\n", cp);
353 free(cp);
354 if(to_native == (iconv_t) -1)
355 exit(1);
356}
357
358
359
360#endif
361
362
/**
 * Convert a wide string to the native multibyte encoding.
 *
 * wchar:   input; conversion stops after len characters or at the first
 *          L'\0', whichever comes first
 * native:  output buffer; up to out_len bytes are generated and a '\0'
 *          is stored after them, so it must hold out_len + 1 bytes
 * len:     maximum number of wide characters to convert
 * out_len: maximum number of bytes to generate (terminator excluded)
 *
 * Characters that cannot be represented are replaced by '_'.
 * Returns the number of bytes generated. The build without iconv
 * returns -1 if wcrtomb fails for a reason other than EILSEQ.
 */
int wchar_to_native(const wchar_t *wchar, char *native, size_t len,
		    size_t out_len)
{
#ifdef HAVE_ICONV_H
	/* safe_iconv ORs into this flag, so it must start out defined */
	int mangled = 0;
	int r;
	initialize_to_native();
	len = wcsnlen(wchar,len);
	r=safe_iconv(to_native, wchar, native, len, out_len, &mangled);
	native[r]='\0';
	return r;
#else
	char buf[MB_LEN_MAX];	/* holds one converted character */
	size_t used = 0;	/* bytes stored in native so far */
	size_t i;
	mbstate_t ps;

	memset(&ps, 0, sizeof(ps));
	for(i=0; i<len && wchar[i] != 0; i++) {
		size_t r = wcrtomb(buf, wchar[i], &ps);
		if(r == (size_t) -1) {
			if(errno != EILSEQ)
				return -1;
			/* Unrepresentable character: substitute it and
			 * restart from the initial conversion state */
			buf[0] = '_';
			r = 1;
			memset(&ps, 0, sizeof(ps));
		}
		/* Never generate more than out_len bytes */
		if(r > out_len - used)
			break;
		memcpy(native + used, buf, r);
		used += r;
	}
	native[used]='\0';
	return (int) used;
#endif
}
397
/**
 * Convert a native multibyte string to a wide string.
 *
 * native:  input text
 * wchar:   output buffer; at most len characters are generated and a
 *          L'\0' is stored after them, so it must hold len + 1 entries
 * len:     maximum number of wide characters to generate
 * end:     if not NULL, conversion stops once the source pointer reaches
 *          end; if NULL, it stops at the terminating NUL of native
 * mangled: if not NULL, bits 0 and 1 are set when input was left
 *          unconverted
 *
 * A byte that cannot be decoded is taken as Latin1 if it lies in
 * 0xa0..0xfe, and replaced by '_' otherwise.
 * Returns the number of wide characters generated.
 */
int native_to_wchar(const char *native, wchar_t *wchar, size_t len,
		    const char *end, int *mangled)
{
	mbstate_t ps;
	unsigned int i;
	memset(&ps, 0, sizeof(ps));

	/* Test end for NULL before comparing against it */
	for(i=0; i<len && (!end || native < end); i++) {
		/* Bytes mbrtowc may look at: what is left before end, or
		 * one full character for a NUL-terminated string. This is
		 * unrelated to len, which counts output characters. */
		size_t avail = end ? (size_t) (end - native)
				   : (size_t) MB_CUR_MAX;
		size_t r = mbrtowc(wchar+i, native, avail, &ps);

		if(r == (size_t) -1 || r == (size_t) -2) {
			/* Unconvertible character. Just pretend it's Latin1
			   encoded (if valid Latin1 character) or substitute
			   with an underscore if not
			*/
			unsigned char c = (unsigned char) *native;
			if(c >= 0xa0 && c < 0xff)
				wchar[i] = c;
			else
				wchar[i] = '_';
			memset(&ps, 0, sizeof(ps));
			r=1;
		}
		/* mbrtowc returns 0 for the terminating NUL */
		if(r == 0)
			break;
		native += r;
	}
	/* Flag input that was not converted: bytes remaining before end,
	 * or a NUL-terminated string that filled the output */
	if(mangled && ((end && native < end) || (!end && *native && i == len)))
		*mangled |= 3;
	wchar[i]='\0';
	return i;
}
434