blob: ad8b840ef7ea60d23d544224e14ae272271ae407 [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
2
3#ifdef HAVE_STAT
4
/* Decode a byte string from the locale encoding with the surrogateescape
   error handler: an undecodable byte 0x80..0xFF is decoded as the lone
   surrogate U+DC80..U+DCFF.  If a byte sequence decodes to a surrogate
   character (U+D800..U+DFFF), the original bytes are escaped with the same
   scheme instead of being decoded.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated, nul-terminated wide character
   string (use PyMem_Free() to free the memory), or NULL on error (conversion
   error or memory error).  A short diagnostic is written to stderr on
   error. */
wchar_t*
_Py_char2wchar(char* arg)
{
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif

    /* Fast path: decode the whole string in one mbstowcs() call. */
    if (argsize != (size_t)-1) {
        /* Refuse lengths whose byte size would wrap around size_t. */
        if (argsize >= (size_t)-1 / sizeof(wchar_t))
            goto oom;
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0)
                return res;
        }
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory.  The count includes the
       terminating nul byte, so the loop below stores the terminator. */
    argsize = strlen(arg) + 1;
    if (argsize > (size_t)-1 / sizeof(wchar_t))
        goto oom;
    res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            /* Do not leak the partially filled buffer. */
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128).  This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    argsize = strlen(arg) + 1;
    if (argsize > (size_t)-1 / sizeof(wchar_t))
        goto oom;
    res = PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}
118
/* Encode a (wide) character string to the locale encoding with the
   surrogateescape error handler: a character in range U+DC80..U+DCFF is
   written as the single byte 0x80..0xFF; every other character is encoded
   on its own with wcstombs().

   This function is the reverse of _Py_char2wchar().

   Return a pointer to a newly allocated, nul-terminated byte string (use
   PyMem_Free() to free the memory), or NULL on error (conversion error or
   memory error). */
char*
_Py_wchar2char(const wchar_t *text)
{
    const size_t nchars = wcslen(text);
    char *encoded = NULL;   /* output buffer; still NULL during pass 1 */
    char *cursor = NULL;    /* write position; NULL means "only measure" */
    size_t remaining = 0;   /* pass 1: bytes needed; pass 2: bytes left */
    size_t idx, nbytes;
    wchar_t single[2];      /* one-character string handed to wcstombs() */

    single[1] = 0;

    /* Two passes over the input: the first one measures the output, the
       second one (once the buffer exists) writes it. */
    for (;;) {
        for (idx = 0; idx < nchars; idx++) {
            const wchar_t ch = text[idx];

            if (ch >= 0xdc80 && ch <= 0xdcff) {
                /* UTF-8b surrogate: exactly one raw byte */
                nbytes = 1;
                if (cursor != NULL)
                    *cursor = (char)(ch - 0xdc00);
            }
            else {
                single[0] = ch;
                if (cursor != NULL)
                    nbytes = wcstombs(cursor, single, remaining);
                else
                    nbytes = wcstombs(NULL, single, 0);
                if (nbytes == (size_t)-1) {
                    /* character not encodable in the locale encoding */
                    if (encoded != NULL)
                        PyMem_Free(encoded);
                    return NULL;
                }
            }

            if (cursor != NULL) {
                cursor += nbytes;
                remaining -= nbytes;
            }
            else {
                remaining += nbytes;
            }
        }

        if (encoded != NULL) {
            /* second pass finished: terminate and stop */
            *cursor = 0;
            break;
        }

        /* first pass finished: allocate room for the bytes + nul byte */
        remaining += 1;
        encoded = PyMem_Malloc(remaining);
        if (encoded == NULL)
            return NULL;
        cursor = encoded;
    }
    return encoded;
}
185
186#if defined(MS_WINDOWS) || defined(HAVE_STAT)
/* stat() a path given as a wide character string.  On Windows the call is
   forwarded to _wstat(); elsewhere the path is encoded with
   _Py_wchar2char() and passed to stat().  Return the stat result, or -1
   with errno set to EINVAL if the path cannot be encoded. */
int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
/* In principle, this should use HAVE__WSTAT, and _wstat
   should be detected by autoconf. However, no current
   POSIX system provides that function, so testing for
   it is pointless.
   Not sure whether the MS_WINDOWS guards are necessary:
   perhaps for cygwin/mingw builds?
*/
#ifdef MS_WINDOWS
    return _wstat(path, buf);
#else
    int status;
    char *encoded = _Py_wchar2char(path);

    if (encoded == NULL) {
        errno = EINVAL;
        return -1;
    }
    status = stat(encoded, buf);
    PyMem_Free(encoded);
    return status;
#endif
}
212#endif
213
214/* Call _wstat() on Windows, or stat() otherwise. Only fill st_mode
215 attribute on Windows. Return 0 on success, -1 on stat error or (if
216 PyErr_Occurred()) unicode error. */
217
218int
219_Py_stat(PyObject *unicode, struct stat *statbuf)
220{
221#ifdef MS_WINDOWS
222 wchar_t *path;
223 int err;
224 struct _stat wstatbuf;
225
226 path = PyUnicode_AsWideCharString(unicode, NULL);
227 if (path == NULL)
228 return -1;
229 err = _wstat(path, &wstatbuf);
230 PyMem_Free(path);
231 if (!err)
232 statbuf->st_mode = wstatbuf.st_mode;
233 return err;
234#else
235 int ret;
236 PyObject *bytes = PyUnicode_EncodeFSDefault(unicode);
237 if (bytes == NULL)
238 return -1;
239 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
240 Py_DECREF(bytes);
241 return ret;
242#endif
243}
244
/* Open a file whose path and mode are wide character strings.  On Windows
   the call is forwarded to _wfopen(); elsewhere the mode is converted with
   wcstombs(), the path with _Py_wchar2char(), and fopen() is called.

   Return the stream, or NULL on error.  errno is set to EINVAL when the
   mode does not fit the local buffer or cannot be converted, or when the
   path cannot be encoded; otherwise fopen() reports the error. */
FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifndef MS_WINDOWS
    FILE *f;
    char *cpath;
    char cmode[10];
    size_t r;

    r = wcstombs(cmode, mode, sizeof(cmode));
    /* r == sizeof(cmode) means the buffer was filled without a nul byte */
    if (r == (size_t)-1 || r >= sizeof(cmode)) {
        errno = EINVAL;
        return NULL;
    }
    cpath = _Py_wchar2char(path);
    if (cpath == NULL) {
        /* report the failure like the other wrappers in this file do */
        errno = EINVAL;
        return NULL;
    }
    f = fopen(cpath, cmode);
    PyMem_Free(cpath);
    return f;
#else
    return _wfopen(path, mode);
#endif
}
268
269/* Call _wfopen() on Windows, or fopen() otherwise. Return the new file
270 object on success, or NULL if the file cannot be open or (if
271 PyErr_Occurred()) on unicode error */
272
273FILE*
274_Py_fopen(PyObject *unicode, const char *mode)
275{
276#ifdef MS_WINDOWS
277 wchar_t *path;
278 wchar_t wmode[10];
279 int usize;
280 FILE *f;
281
282 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
283 if (usize == 0)
284 return NULL;
285
286 path = PyUnicode_AsWideCharString(unicode, NULL);
287 if (path == NULL)
288 return NULL;
289 f = _wfopen(path, wmode);
290 PyMem_Free(path);
291 return f;
292#else
293 FILE *f;
294 PyObject *bytes = PyUnicode_EncodeFSDefault(unicode);
295 if (bytes == NULL)
296 return NULL;
297 f = fopen(PyBytes_AS_STRING(bytes), mode);
298 Py_DECREF(bytes);
299 return f;
300#endif
301}
302
303#ifdef HAVE_READLINK
304int
305_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
306{
307 char *cpath;
308 char cbuf[PATH_MAX];
309 int res;
310 size_t r1;
311
312 cpath = _Py_wchar2char(path);
313 if (cpath == NULL) {
314 errno = EINVAL;
315 return -1;
316 }
317 res = (int)readlink(cpath, cbuf, PATH_MAX);
318 PyMem_Free(cpath);
319 if (res == -1)
320 return -1;
321 if (res == PATH_MAX) {
322 errno = EINVAL;
323 return -1;
324 }
325 cbuf[res] = '\0'; /* buf will be null terminated */
326 r1 = mbstowcs(buf, cbuf, bufsiz);
327 if (r1 == -1) {
328 errno = EINVAL;
329 return -1;
330 }
331 return (int)r1;
332}
333#endif
334
335#ifdef HAVE_REALPATH
336wchar_t*
337_Py_wrealpath(const wchar_t *path, wchar_t *resolved_path)
338{
339 char *cpath;
340 char cresolved_path[PATH_MAX];
341 char *res;
342 size_t r;
343 cpath = _Py_wchar2char(path);
344 if (cpath == NULL) {
345 errno = EINVAL;
346 return NULL;
347 }
348 res = realpath(cpath, cresolved_path);
349 PyMem_Free(cpath);
350 if (res == NULL)
351 return NULL;
352 r = mbstowcs(resolved_path, cresolved_path, PATH_MAX);
353 if (r == (size_t)-1 || r >= PATH_MAX) {
354 errno = EINVAL;
355 return NULL;
356 }
357 return resolved_path;
358}
359#endif
360
/* getcwd() for wide character strings.  On Windows the call is forwarded
   to _wgetcwd(); elsewhere the directory is read with getcwd() into a
   local PATH_MAX buffer and decoded into buf with mbstowcs().

   size is the capacity of buf in wide characters.  Return buf on success,
   or NULL on error.  errno is set to ERANGE when the decoded path
   (including its terminating nul) does not fit in buf; a decoding failure
   takes the same branch because mbstowcs() then returns (size_t)-1. */
wchar_t*
_Py_wgetcwd(wchar_t *buf, size_t size)
{
#ifdef MS_WINDOWS
    return _wgetcwd(buf, size);
#else
    char cwd_bytes[PATH_MAX];
    size_t nchars;

    if (getcwd(cwd_bytes, PATH_MAX) == NULL)
        return NULL;

    nchars = mbstowcs(buf, cwd_bytes, size);
    if (nchars >= size) {
        errno = ERANGE;
        return NULL;
    }
    return buf;
#endif
}
377
378#endif
379
380#include "Python.h"
381
382#ifdef HAVE_STAT
383
/* Decode a byte string from the locale encoding with the surrogateescape
   error handler: an undecodable byte 0x80..0xFF is decoded as the lone
   surrogate U+DC80..U+DCFF.  If a byte sequence decodes to a surrogate
   character (U+D800..U+DFFF), the original bytes are escaped with the same
   scheme instead of being decoded.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated, nul-terminated wide character
   string (use PyMem_Free() to free the memory), or NULL on error (conversion
   error or memory error).  A short diagnostic is written to stderr on
   error. */
wchar_t*
_Py_char2wchar(char* arg)
{
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif

    /* Fast path: decode the whole string in one mbstowcs() call. */
    if (argsize != (size_t)-1) {
        /* Refuse lengths whose byte size would wrap around size_t. */
        if (argsize >= (size_t)-1 / sizeof(wchar_t))
            goto oom;
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0)
                return res;
        }
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory.  The count includes the
       terminating nul byte, so the loop below stores the terminator. */
    argsize = strlen(arg) + 1;
    if (argsize > (size_t)-1 / sizeof(wchar_t))
        goto oom;
    res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            /* Do not leak the partially filled buffer. */
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128).  This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    argsize = strlen(arg) + 1;
    if (argsize > (size_t)-1 / sizeof(wchar_t))
        goto oom;
    res = PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}
497
/* Encode a (wide) character string to the locale encoding with the
   surrogateescape error handler: a character in range U+DC80..U+DCFF is
   written as the single byte 0x80..0xFF; every other character is encoded
   on its own with wcstombs().

   This function is the reverse of _Py_char2wchar().

   Return a pointer to a newly allocated, nul-terminated byte string (use
   PyMem_Free() to free the memory), or NULL on error (conversion error or
   memory error). */
char*
_Py_wchar2char(const wchar_t *text)
{
    const size_t nchars = wcslen(text);
    char *encoded = NULL;   /* output buffer; still NULL during pass 1 */
    char *cursor = NULL;    /* write position; NULL means "only measure" */
    size_t remaining = 0;   /* pass 1: bytes needed; pass 2: bytes left */
    size_t idx, nbytes;
    wchar_t single[2];      /* one-character string handed to wcstombs() */

    single[1] = 0;

    /* Two passes over the input: the first one measures the output, the
       second one (once the buffer exists) writes it. */
    for (;;) {
        for (idx = 0; idx < nchars; idx++) {
            const wchar_t ch = text[idx];

            if (ch >= 0xdc80 && ch <= 0xdcff) {
                /* UTF-8b surrogate: exactly one raw byte */
                nbytes = 1;
                if (cursor != NULL)
                    *cursor = (char)(ch - 0xdc00);
            }
            else {
                single[0] = ch;
                if (cursor != NULL)
                    nbytes = wcstombs(cursor, single, remaining);
                else
                    nbytes = wcstombs(NULL, single, 0);
                if (nbytes == (size_t)-1) {
                    /* character not encodable in the locale encoding */
                    if (encoded != NULL)
                        PyMem_Free(encoded);
                    return NULL;
                }
            }

            if (cursor != NULL) {
                cursor += nbytes;
                remaining -= nbytes;
            }
            else {
                remaining += nbytes;
            }
        }

        if (encoded != NULL) {
            /* second pass finished: terminate and stop */
            *cursor = 0;
            break;
        }

        /* first pass finished: allocate room for the bytes + nul byte */
        remaining += 1;
        encoded = PyMem_Malloc(remaining);
        if (encoded == NULL)
            return NULL;
        cursor = encoded;
    }
    return encoded;
}
564
565#if defined(MS_WINDOWS) || defined(HAVE_STAT)
/* stat() a path given as a wide character string.  On Windows the call is
   forwarded to _wstat(); elsewhere the path is encoded with
   _Py_wchar2char() and passed to stat().  Return the stat result, or -1
   with errno set to EINVAL if the path cannot be encoded. */
int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
/* In principle, this should use HAVE__WSTAT, and _wstat
   should be detected by autoconf. However, no current
   POSIX system provides that function, so testing for
   it is pointless.
   Not sure whether the MS_WINDOWS guards are necessary:
   perhaps for cygwin/mingw builds?
*/
#ifdef MS_WINDOWS
    return _wstat(path, buf);
#else
    int status;
    char *encoded = _Py_wchar2char(path);

    if (encoded == NULL) {
        errno = EINVAL;
        return -1;
    }
    status = stat(encoded, buf);
    PyMem_Free(encoded);
    return status;
#endif
}
591#endif
592
593/* Call _wstat() on Windows, or stat() otherwise. Only fill st_mode
594 attribute on Windows. Return 0 on success, -1 on stat error or (if
595 PyErr_Occurred()) unicode error. */
596
597int
598_Py_stat(PyObject *unicode, struct stat *statbuf)
599{
600#ifdef MS_WINDOWS
601 wchar_t *path;
602 int err;
603 struct _stat wstatbuf;
604
605 path = PyUnicode_AsWideCharString(unicode, NULL);
606 if (path == NULL)
607 return -1;
608 err = _wstat(path, &wstatbuf);
609 PyMem_Free(path);
610 if (!err)
611 statbuf->st_mode = wstatbuf.st_mode;
612 return err;
613#else
614 int ret;
615 PyObject *bytes = PyUnicode_EncodeFSDefault(unicode);
616 if (bytes == NULL)
617 return -1;
618 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
619 Py_DECREF(bytes);
620 return ret;
621#endif
622}
623
/* Open a file whose path and mode are wide character strings.  On Windows
   the call is forwarded to _wfopen(); elsewhere the mode is converted with
   wcstombs(), the path with _Py_wchar2char(), and fopen() is called.

   Return the stream, or NULL on error.  errno is set to EINVAL when the
   mode does not fit the local buffer or cannot be converted, or when the
   path cannot be encoded; otherwise fopen() reports the error. */
FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifndef MS_WINDOWS
    FILE *f;
    char *cpath;
    char cmode[10];
    size_t r;

    r = wcstombs(cmode, mode, sizeof(cmode));
    /* r == sizeof(cmode) means the buffer was filled without a nul byte */
    if (r == (size_t)-1 || r >= sizeof(cmode)) {
        errno = EINVAL;
        return NULL;
    }
    cpath = _Py_wchar2char(path);
    if (cpath == NULL) {
        /* report the failure like the other wrappers in this file do */
        errno = EINVAL;
        return NULL;
    }
    f = fopen(cpath, cmode);
    PyMem_Free(cpath);
    return f;
#else
    return _wfopen(path, mode);
#endif
}
647
648/* Call _wfopen() on Windows, or fopen() otherwise. Return the new file
649 object on success, or NULL if the file cannot be open or (if
650 PyErr_Occurred()) on unicode error */
651
652FILE*
653_Py_fopen(PyObject *unicode, const char *mode)
654{
655#ifdef MS_WINDOWS
656 wchar_t *path;
657 wchar_t wmode[10];
658 int usize;
659 FILE *f;
660
661 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
662 if (usize == 0)
663 return NULL;
664
665 path = PyUnicode_AsWideCharString(unicode, NULL);
666 if (path == NULL)
667 return NULL;
668 f = _wfopen(path, wmode);
669 PyMem_Free(path);
670 return f;
671#else
672 FILE *f;
673 PyObject *bytes = PyUnicode_EncodeFSDefault(unicode);
674 if (bytes == NULL)
675 return NULL;
676 f = fopen(PyBytes_AS_STRING(bytes), mode);
677 Py_DECREF(bytes);
678 return f;
679#endif
680}
681
682#ifdef HAVE_READLINK
683int
684_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
685{
686 char *cpath;
687 char cbuf[PATH_MAX];
688 int res;
689 size_t r1;
690
691 cpath = _Py_wchar2char(path);
692 if (cpath == NULL) {
693 errno = EINVAL;
694 return -1;
695 }
696 res = (int)readlink(cpath, cbuf, PATH_MAX);
697 PyMem_Free(cpath);
698 if (res == -1)
699 return -1;
700 if (res == PATH_MAX) {
701 errno = EINVAL;
702 return -1;
703 }
704 cbuf[res] = '\0'; /* buf will be null terminated */
705 r1 = mbstowcs(buf, cbuf, bufsiz);
706 if (r1 == -1) {
707 errno = EINVAL;
708 return -1;
709 }
710 return (int)r1;
711}
712#endif
713
714#ifdef HAVE_REALPATH
715wchar_t*
716_Py_wrealpath(const wchar_t *path, wchar_t *resolved_path)
717{
718 char *cpath;
719 char cresolved_path[PATH_MAX];
720 char *res;
721 size_t r;
722 cpath = _Py_wchar2char(path);
723 if (cpath == NULL) {
724 errno = EINVAL;
725 return NULL;
726 }
727 res = realpath(cpath, cresolved_path);
728 PyMem_Free(cpath);
729 if (res == NULL)
730 return NULL;
731 r = mbstowcs(resolved_path, cresolved_path, PATH_MAX);
732 if (r == (size_t)-1 || r >= PATH_MAX) {
733 errno = EINVAL;
734 return NULL;
735 }
736 return resolved_path;
737}
738#endif
739
/* getcwd() for wide character strings.  On Windows the call is forwarded
   to _wgetcwd(); elsewhere the directory is read with getcwd() into a
   local PATH_MAX buffer and decoded into buf with mbstowcs().

   size is the capacity of buf in wide characters.  Return buf on success,
   or NULL on error.  errno is set to ERANGE when the decoded path
   (including its terminating nul) does not fit in buf; a decoding failure
   takes the same branch because mbstowcs() then returns (size_t)-1. */
wchar_t*
_Py_wgetcwd(wchar_t *buf, size_t size)
{
#ifdef MS_WINDOWS
    return _wgetcwd(buf, size);
#else
    char cwd_bytes[PATH_MAX];
    size_t nchars;

    if (getcwd(cwd_bytes, PATH_MAX) == NULL)
        return NULL;

    nchars = mbstowcs(buf, cwd_bytes, size);
    if (nchars >= size) {
        errno = ERANGE;
        return NULL;
    }
    return buf;
#endif
}
756
757#endif
758