Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 1 | /* Copyright 2008,2009 Alain Knaff. |
| 2 | * This file is part of mtools. |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 3 | * |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 4 | * Mtools is free software: you can redistribute it and/or modify |
| 5 | * it under the terms of the GNU General Public License as published by |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 6 | * the Free Software Foundation, either version 3 of the License, or |
| 7 | * (at your option) any later version. |
| 8 | * |
| 9 | * Mtools is distributed in the hope that it will be useful, |
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 12 | * GNU General Public License for more details. |
| 13 | * |
| 14 | * You should have received a copy of the GNU General Public License |
| 15 | * along with Mtools. If not, see <http://www.gnu.org/licenses/>. |
| 16 | * |
| 17 | * Various character set conversions used by mtools |
| 18 | */ |
#include "sysincludes.h"
#include "msdos.h"
#include "mtools.h"

#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include "file_name.h"
| 27 | |
| 28 | |
| 29 | #ifdef HAVE_ICONV_H |
| 30 | #include <iconv.h> |
| 31 | |
/* One open DOS codepage: a pair of iconv descriptors, one per direction. */
struct doscp_t {
	iconv_t from;	/* DOS codepage -> wchar_t (see cp_open) */
	iconv_t to;	/* wchar_t -> DOS codepage (see cp_open) */
};
| 36 | |
/* iconv name found to match wchar_t; stays NULL until getWcharCp() succeeds */
static const char *wcharCp = NULL;

/* Candidate iconv names for the wchar_t encoding, probed in this order */
static const char *wcharTries[] = {
	"WCHAR_T",
	"UTF-32BE",
	"UTF-32LE",
	"UTF-16BE",
	"UTF-16LE",
	"UTF-32",
	"UTF-16",
	"UCS-4BE",
	"UCS-4LE",
	"UCS-2BE",
	"UCS-2LE",
	"UCS-4",
	"UCS-2"
};

/* Candidate iconv names for a plain ASCII target used while probing */
static const char *asciiTries[] = {
	"ASCII",
	"ASCII-GR",
	"ISO8859-1"
};

/* Wide sample text that a correct wchar_t encoding must turn into "ab" */
static const wchar_t *testString = L"ab";
| 54 | |
| 55 | static int try(const char *testCp) { |
| 56 | size_t res; |
| 57 | char *inbuf = (char *)testString; |
| 58 | size_t inbufLen = 2*sizeof(wchar_t); |
| 59 | char outbuf[3]; |
| 60 | char *outbufP = outbuf; |
| 61 | size_t outbufLen = 2*sizeof(char); |
| 62 | iconv_t test = 0; |
| 63 | size_t i; |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 64 | |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 65 | for(i=0; i < sizeof(asciiTries) / sizeof(asciiTries[0]); i++) { |
| 66 | test = iconv_open(asciiTries[i], testCp); |
| 67 | if(test != (iconv_t) -1) |
| 68 | break; |
| 69 | } |
| 70 | if(test == (iconv_t) -1) |
| 71 | goto fail0; |
| 72 | res = iconv(test, |
| 73 | &inbuf, &inbufLen, |
| 74 | &outbufP, &outbufLen); |
| 75 | if(res != 0 || outbufLen != 0 || inbufLen != 0) |
| 76 | goto fail; |
| 77 | if(memcmp(outbuf, "ab", 2)) |
| 78 | goto fail; |
| 79 | /* fprintf(stderr, "%s ok\n", testCp); */ |
| 80 | return 1; |
| 81 | fail: |
| 82 | iconv_close(test); |
| 83 | fail0: |
| 84 | /*fprintf(stderr, "%s fail\n", testCp);*/ |
| 85 | return 0; |
| 86 | } |
| 87 | |
| 88 | static const char *getWcharCp(void) { |
| 89 | unsigned int i; |
| 90 | if(wcharCp != NULL) |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 91 | return wcharCp; |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 92 | for(i=0; i< sizeof(wcharTries) / sizeof(wcharTries[0]); i++) { |
| 93 | if(try(wcharTries[i])) |
| 94 | return (wcharCp=wcharTries[i]); |
| 95 | } |
| 96 | fprintf(stderr, "No codepage found for wchar_t\n"); |
| 97 | return NULL; |
| 98 | } |
| 99 | |
| 100 | |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 101 | doscp_t *cp_open(unsigned int codepage) |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 102 | { |
| 103 | char dosCp[17]; |
| 104 | doscp_t *ret; |
Alistair Delva | 8d7cf5d | 2021-04-13 07:59:20 -0700 | [diff] [blame] | 105 | iconv_t from; |
| 106 | iconv_t to; |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 107 | |
| 108 | if(codepage == 0) |
| 109 | codepage = mtools_default_codepage; |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 110 | if(codepage > 9999) { |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 111 | fprintf(stderr, "Bad codepage %d\n", codepage); |
| 112 | return NULL; |
| 113 | } |
| 114 | |
| 115 | if(getWcharCp() == NULL) |
| 116 | return NULL; |
| 117 | |
| 118 | sprintf(dosCp, "CP%d", codepage); |
| 119 | from = iconv_open(wcharCp, dosCp); |
| 120 | if(from == (iconv_t)-1) { |
| 121 | fprintf(stderr, "Error converting to codepage %d %s\n", |
| 122 | codepage, strerror(errno)); |
| 123 | return NULL; |
| 124 | } |
| 125 | |
| 126 | sprintf(dosCp, "CP%d//TRANSLIT", codepage); |
| 127 | to = iconv_open(dosCp, wcharCp); |
| 128 | if(to == (iconv_t)-1) { |
| 129 | /* Transliteration not supported? */ |
| 130 | sprintf(dosCp, "CP%d", codepage); |
| 131 | to = iconv_open(dosCp, wcharCp); |
| 132 | } |
| 133 | if(to == (iconv_t)-1) { |
| 134 | iconv_close(from); |
| 135 | fprintf(stderr, "Error converting to codepage %d %s\n", |
| 136 | codepage, strerror(errno)); |
| 137 | return NULL; |
| 138 | } |
| 139 | |
| 140 | ret = New(doscp_t); |
| 141 | if(ret == NULL) |
| 142 | return ret; |
| 143 | ret->from = from; |
| 144 | ret->to = to; |
| 145 | return ret; |
| 146 | } |
| 147 | |
| 148 | void cp_close(doscp_t *cp) |
| 149 | { |
| 150 | iconv_close(cp->to); |
| 151 | iconv_close(cp->from); |
| 152 | free(cp); |
| 153 | } |
| 154 | |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 155 | size_t dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len) |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 156 | { |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 157 | size_t r; |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 158 | size_t in_len=len; |
| 159 | size_t out_len=len*sizeof(wchar_t); |
| 160 | wchar_t *dptr=wchar; |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 161 | char *dos2 = (char *) dos; /* Magic to be able to call iconv with its |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 162 | buggy prototype */ |
| 163 | r=iconv(cp->from, &dos2, &in_len, (char **)&dptr, &out_len); |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 164 | if(r == (size_t) -1) |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 165 | return r; |
| 166 | *dptr = L'\0'; |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 167 | return (size_t) (dptr-wchar); |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 168 | } |
| 169 | |
/**
 * Converts in_len wide characters to dest through conv, writing at most
 * out_len bytes. Caller's responsibility to ensure that dest is large
 * enough. No terminator is written.
 *
 * A wide character that iconv rejects with EILSEQ is replaced by '_' and
 * conversion resumes with the next one. Any other iconv error stops the
 * conversion. '?' bytes in the output (as may be produced for
 * untransliterable characters) are also turned into '_'.
 *
 * mangled must be non-NULL; bit 0 is set in *mangled whenever a
 * substitution took place.
 *
 * Returns the number of bytes written to dest.
 */
static size_t safe_iconv(iconv_t conv, const wchar_t *wchar, char *dest,
			 size_t in_len, size_t out_len, int *mangled)
{
	size_t r;
	size_t i;
	size_t len;
	/* byte cursors handed to iconv, which advances them itself */
	char *in = (char *) wchar;
	char *dptr = dest;

	in_len=in_len*sizeof(wchar_t);

	while(in_len > 0 && out_len > 0) {
		r=iconv(conv, &in, &in_len, &dptr, &out_len);
		if(r != (size_t) -1 || errno != EILSEQ) {
			/* everything transformed, or error that is _not_ a bad
			 * character */
			break;
		}
		*mangled |= 1;

		if(out_len == 0)
			break;

		/* iconv stopped in front of the offending wide character:
		 * emit a placeholder and step over it */
		*dptr++ = '_';
		out_len--;
		in += sizeof(wchar_t);
		in_len -= sizeof(wchar_t);
	}

	len = (size_t) (dptr-dest); /* how many dest characters have there been
				       generated */

	/* eliminate question marks which might have been formed by
	   untransliterable characters */
	for(i=0; i<len; i++) {
		if(dest[i] == '?') {
			dest[i] = '_';
			*mangled |= 1;
		}
	}
	return len;
}
| 217 | |
| 218 | void wchar_to_dos(doscp_t *cp, |
| 219 | wchar_t *wchar, char *dos, size_t len, int *mangled) |
| 220 | { |
| 221 | safe_iconv(cp->to, wchar, dos, len, len, mangled); |
| 222 | } |
| 223 | |
| 224 | #else |
| 225 | |
| 226 | #include "codepage.h" |
| 227 | |
/* Table-driven codepage handle used when iconv is not available. */
struct doscp_t {
	/* lookup table indexed with (byte & 0x7f); set by cp_open to the
	 * tounix table of the selected codepage */
	unsigned char *from_dos;
	/* reverse table filled by cp_open: indexed with (native & 0x7f),
	 * holds the DOS byte (0x80 | index) or 0 when there is none */
	unsigned char to_dos[0x80];
};
| 232 | |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 233 | doscp_t *cp_open(unsigned int codepage) |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 234 | { |
| 235 | doscp_t *ret; |
| 236 | int i; |
| 237 | Codepage_t *cp; |
| 238 | |
| 239 | if(codepage == 0) |
| 240 | codepage = 850; |
| 241 | |
| 242 | ret = New(doscp_t); |
| 243 | if(ret == NULL) |
| 244 | return ret; |
| 245 | |
| 246 | for(cp=codepages; cp->nr ; cp++) |
| 247 | if(cp->nr == codepage) { |
| 248 | ret->from_dos = cp->tounix; |
| 249 | break; |
| 250 | } |
| 251 | |
| 252 | if(ret->from_dos == NULL) { |
| 253 | fprintf(stderr, "Bad codepage %d\n", codepage); |
| 254 | free(ret); |
| 255 | return NULL; |
| 256 | } |
| 257 | |
| 258 | for(i=0; i<0x80; i++) { |
| 259 | char native = ret->from_dos[i]; |
| 260 | if(! (native & 0x80)) |
| 261 | continue; |
| 262 | ret->to_dos[native & 0x7f] = 0x80 | i; |
| 263 | } |
| 264 | return ret; |
| 265 | } |
| 266 | |
| 267 | void cp_close(doscp_t *cp) |
| 268 | { |
| 269 | free(cp); |
| 270 | } |
| 271 | |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 272 | size_t dos_to_wchar(doscp_t *cp, const char *dos, wchar_t *wchar, size_t len) |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 273 | { |
| 274 | int i; |
| 275 | |
| 276 | for(i=0; i<len && dos[i]; i++) { |
| 277 | char c = dos[i]; |
| 278 | if(c >= ' ' && c <= '~') |
| 279 | wchar[i] = c; |
| 280 | else { |
| 281 | wchar[i] = cp->from_dos[c & 0x7f]; |
| 282 | } |
| 283 | } |
| 284 | wchar[i] = '\0'; |
| 285 | return i; |
| 286 | } |
| 287 | |
| 288 | |
| 289 | void wchar_to_dos(doscp_t *cp, |
| 290 | wchar_t *wchar, char *dos, size_t len, int *mangled) |
| 291 | { |
| 292 | int i; |
| 293 | for(i=0; i<len && wchar[i]; i++) { |
| 294 | char c = wchar[i]; |
| 295 | if(c >= ' ' && c <= '~') |
| 296 | dos[i] = c; |
| 297 | else { |
| 298 | dos[i] = cp->to_dos[c & 0x7f]; |
| 299 | if(dos[i] == '\0') { |
| 300 | dos[i]='_'; |
| 301 | *mangled=1; |
| 302 | } |
| 303 | } |
| 304 | } |
| 305 | } |
| 306 | |
| 307 | #endif |
| 308 | |
| 309 | |
| 310 | #ifndef HAVE_WCHAR_H |
| 311 | |
/* Single-byte stand-ins for the <wchar.h> conversion functions. */
typedef int mbstate_t;

/*
 * Stores wc as one byte in *s and returns 1. Values that do not fit in
 * one byte fail with (size_t)-1 and errno set to EILSEQ, like the real
 * function does for unrepresentable characters.
 */
static inline size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps)
{
	(void) ps;
	if((unsigned long) wc > 0xff) {
		errno = EILSEQ;
		return (size_t) -1;
	}
	*s = (char) wc;
	return 1;
}

/*
 * Stores the byte at *s, taken as an unsigned value, in *pwc. Returns 0
 * for the terminating NUL and 1 for any other byte, so that callers
 * relying on the 0 return to detect end of string keep working.
 */
static inline size_t mbrtowc(wchar_t *pwc, const char *s,
			     size_t n, mbstate_t *ps)
{
	(void) n;
	(void) ps;
	*pwc = (unsigned char) *s;
	return *s == '\0' ? 0 : 1;
}
| 326 | |
| 327 | #endif |
| 328 | |
| 329 | #ifdef HAVE_ICONV_H |
| 330 | |
| 331 | #include <langinfo.h> |
| 332 | |
| 333 | static iconv_t to_native = NULL; |
| 334 | |
| 335 | static void initialize_to_native(void) |
| 336 | { |
| 337 | char *li, *cp; |
Yi Kong | 39bbd96 | 2022-01-09 19:41:38 +0800 | [diff] [blame] | 338 | size_t len; |
Alistair Delva | beaee83 | 2021-02-24 11:27:23 -0800 | [diff] [blame] | 339 | if(to_native != NULL) |
| 340 | return; |
| 341 | li = nl_langinfo(CODESET); |
| 342 | len = strlen(li) + 11; |
| 343 | if(getWcharCp() == NULL) |
| 344 | exit(1); |
| 345 | cp = safe_malloc(len); |
| 346 | strcpy(cp, li); |
| 347 | strcat(cp, "//TRANSLIT"); |
| 348 | to_native = iconv_open(cp, wcharCp); |
| 349 | if(to_native == (iconv_t) -1) |
| 350 | to_native = iconv_open(li, wcharCp); |
| 351 | if(to_native == (iconv_t) -1) |
| 352 | fprintf(stderr, "Could not allocate iconv for %s\n", cp); |
| 353 | free(cp); |
| 354 | if(to_native == (iconv_t) -1) |
| 355 | exit(1); |
| 356 | } |
| 357 | |
| 358 | |
| 359 | |
| 360 | #endif |
| 361 | |
| 362 | |
/**
 * Convert wchar string to native, converting at most len wchar characters
 * and writing at most out_len bytes, followed by a terminating NUL (so
 * native must hold out_len + 1 bytes).
 * Characters that cannot be represented are written as '_'.
 * Returns number of generated native characters (terminator excluded).
 */
size_t wchar_to_native(const wchar_t *wchar, char *native, size_t len,
		       size_t out_len)
{
#ifdef HAVE_ICONV_H
	/* safe_iconv ORs into this, so it must start out defined */
	int mangled = 0;
	size_t r;
	initialize_to_native();
	len = wcsnlen(wchar,len);
	r=safe_iconv(to_native, wchar, native, len, out_len, &mangled);
	native[r]='\0';
	return r;
#else
	size_t i;
	char *dptr = native;
	mbstate_t ps;
	memset(&ps, 0, sizeof(ps));
	for(i=0; i<len && wchar[i] != 0; i++) {
		/* convert into a scratch buffer first so that a character
		 * is only copied out if all of its bytes still fit */
		char tmp[MB_LEN_MAX];
		size_t used = (size_t) (dptr - native);
		size_t r = wcrtomb(tmp, wchar[i], &ps);
		if(r == (size_t) -1) {
			/* unrepresentable character: substitute and
			 * restart from the initial conversion state */
			tmp[0] = '_';
			r = 1;
			memset(&ps, 0, sizeof(ps));
		}
		if(r > out_len - used)
			break;
		memcpy(dptr, tmp, r);
		dptr += r;
	}
	*dptr='\0';
	return (size_t) (dptr-native);
#endif
}
| 395 | |
/**
 * Convert native string to wchar string, generating at most len wchar
 * characters (wchar must hold len + 1, for the terminator). If end is
 * supplied, conversion stops there and no byte at or beyond end is
 * examined; otherwise it stops at the terminating NUL of native.
 *
 * A byte that does not start a convertible character is taken as Latin1
 * when it lies in 0xa0..0xfe and replaced by '_' otherwise.
 *
 * If mangled is non-NULL, 3 is ORed into *mangled when input was left
 * unconverted. Returns number of generated wchars.
 */
size_t native_to_wchar(const char *native, wchar_t *wchar, size_t len,
		       const char *end, int *mangled)
{
	mbstate_t ps;
	size_t i;
	memset(&ps, 0, sizeof(ps));

	for(i=0; i<len && (!end || native < end); i++) {
		/* number of bytes mbrtowc may look at: up to end when it
		 * is given, else one full character (the string is NUL
		 * terminated in that case) */
		size_t avail = end ? (size_t) (end - native)
				   : (size_t) MB_CUR_MAX;
		size_t r = mbrtowc(wchar+i, native, avail, &ps);
		if(r == (size_t) -1 || r == (size_t) -2) {
			/* Unconvertible character, or one cut short by
			   end. Just pretend it's Latin1 encoded (if valid
			   Latin1 character) or substitute with an
			   underscore if not
			*/
			char c = *native;
			if(c >= '\xa0' && c < '\xff')
				wchar[i] = c & 0xff;
			else
				wchar[i] = '_';
			memset(&ps, 0, sizeof(ps));
			r=1;
		}
		if(r == 0)
			break;
		native += r;
	}
	if(mangled && ((end && native < end) || (!end && *native && i == len)))
		*mangled |= 3;
	wchar[i]='\0';
	return i;
}
| 432 | |