blob: 94ab8e42514dc866b87703ab50cc45764a9e2f37 [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
Victor Stinnerb306d752010-10-07 22:09:40 +00002#ifdef MS_WINDOWS
3# include <windows.h>
4#endif
Victor Stinner4e314432010-10-07 21:45:39 +00005
Victor Stinner27b1ca22012-12-03 12:47:59 +01006#ifdef __APPLE__
7extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
8#endif
9
Victor Stinner4e314432010-10-07 21:45:39 +000010#ifdef HAVE_STAT
11
/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   On success, return a pointer to a newly allocated wide character string
   (use PyMem_Free() to free the memory) and, if size is not NULL, write the
   number of written wide characters excluding the null character into *size.

   On error (conversion or memory allocation error), return NULL and, if size
   is not NULL, write (size_t)-1 into *size. A diagnostic is printed to stderr
   on memory allocation failure and on an unexpected mbrtowc() result.

   Conversion errors should never happen, unless there is a bug in the C
   library. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
#ifdef __APPLE__
    /* On this platform the input is always decoded as UTF-8. */
    wchar_t *wstr;
    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
    if (size != NULL) {
        if (wstr != NULL)
            *size = wcslen(wstr);
        else
            *size = (size_t)-1;
    }
    return wstr;
#else
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif

    /* First attempt: let the C library decode the whole string at once. */
    if (argsize != (size_t)-1) {
        /* Refuse sizes for which (argsize+1)*sizeof(wchar_t) would wrap. */
        if (argsize >= ((size_t)-1) / sizeof(wchar_t))
            goto oom;
        res = (wchar_t *)PyMem_Malloc((argsize + 1) * sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize + 1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        PyMem_Free(res);
    }

    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > ((size_t)-1) / sizeof(wchar_t))
        goto oom;
    res = (wchar_t *)PyMem_Malloc(argsize * sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char *)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char *)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            PyMem_Free(res);
            if (size != NULL)
                *size = (size_t)-1;
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else   /* HAVE_MBRTOWC */
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    argsize = strlen(arg) + 1;
    if (argsize > ((size_t)-1) / sizeof(wchar_t))
        goto oom;
    res = PyMem_Malloc(argsize * sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char *)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif   /* HAVE_MBRTOWC */
    if (size != NULL)
        *size = out - res;
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    /* Report the failure through *size as well, like the __APPLE__ branch. */
    if (size != NULL)
        *size = (size_t)-1;
    return NULL;
#endif   /* __APPLE__ */
}
148
/* Encode a (wide) character string to the locale encoding with the
   surrogateescape error handler (characters in range U+DC80..U+DCFF are
   converted to bytes 0x80..0xFF).

   This function is the reverse of _Py_char2wchar().

   Return a pointer to a newly allocated byte string (use PyMem_Free() to free
   the memory), or NULL on conversion or memory allocation error.

   If error_pos is not NULL: *error_pos is the index of the invalid character
   on conversion error, or (size_t)-1 otherwise (success, memory error).
   The __APPLE__ code path does not compute the index and always stores
   (size_t)-1. */
char*
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
{
#ifdef __APPLE__
    Py_ssize_t len;
    PyObject *unicode, *bytes = NULL;
    char *cpath;

    /* Give *error_pos a defined value on every exit path. */
    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    unicode = PyUnicode_FromWideChar(text, wcslen(text));
    if (unicode == NULL)
        return NULL;

    bytes = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                 PyUnicode_GET_SIZE(unicode),
                                 "surrogateescape");
    Py_DECREF(unicode);
    if (bytes == NULL) {
        PyErr_Clear();
        return NULL;
    }

    len = PyBytes_GET_SIZE(bytes);
    cpath = PyMem_Malloc(len + 1);
    if (cpath == NULL) {
        PyErr_Clear();
        Py_DECREF(bytes);
        return NULL;
    }
    /* len + 1: copy the trailing nul byte of the bytes object as well. */
    memcpy(cpath, PyBytes_AsString(bytes), len + 1);
    Py_DECREF(bytes);
    return cpath;
#else   /* __APPLE__ */
    const size_t len = wcslen(text);
    char *result = NULL, *bytes = NULL;
    size_t i, size, converted;
    wchar_t c, buf[2];

    /* Give *error_pos a defined value on every exit path; it is only
       overwritten below when a character cannot be encoded. */
    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    /* The function works in two steps:
       1. compute the length of the output buffer in bytes (size)
       2. outputs the bytes
       The same loop body serves both passes: bytes == NULL means "count",
       bytes != NULL means "write" (size is then the space left). */
    size = 0;
    buf[1] = 0;
    while (1) {
        for (i = 0; i < len; i++) {
            c = text[i];
            if (c >= 0xdc80 && c <= 0xdcff) {
                /* UTF-8b surrogate */
                if (bytes != NULL) {
                    *bytes++ = c - 0xdc00;
                    size--;
                }
                else
                    size++;
                continue;
            }
            else {
                /* Encode one character at a time through a 1-char string. */
                buf[0] = c;
                if (bytes != NULL)
                    converted = wcstombs(bytes, buf, size);
                else
                    converted = wcstombs(NULL, buf, 0);
                if (converted == (size_t)-1) {
                    PyMem_Free(result);
                    if (error_pos != NULL)
                        *error_pos = i;
                    return NULL;
                }
                if (bytes != NULL) {
                    bytes += converted;
                    size -= converted;
                }
                else
                    size += converted;
            }
        }
        if (result != NULL) {
            /* Second pass finished: terminate the output. */
            *bytes = 0;
            break;
        }

        size += 1; /* nul byte at the end */
        result = PyMem_Malloc(size);
        if (result == NULL)
            return NULL;
        bytes = result;
    }
    return result;
#endif   /* __APPLE__ */
}
257
/* Strictly speaking this should be guarded by HAVE__WSTAT, with _wstat
   detected by autoconf. No current POSIX system provides that function
   though, so such a test would be pointless.
   It is unclear whether the MS_WINDOWS guard is required; it may matter
   for cygwin/mingw builds.
*/
#if defined(HAVE_STAT) && !defined(MS_WINDOWS)

/* Get file status: the path is encoded to the locale encoding and passed
   to stat().

   Return the result of stat(), or -1 with errno set to EINVAL if the path
   cannot be encoded. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    int status;
    char *encoded = _Py_wchar2char(path, NULL);

    if (encoded != NULL) {
        status = stat(encoded, buf);
        PyMem_Free(encoded);
        return status;
    }

    errno = EINVAL;
    return -1;
}
#endif
284
Victor Stinner6672d0c2010-10-07 22:53:43 +0000285/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
286 call stat() otherwise. Only fill st_mode attribute on Windows.
287
288 Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
289 unicode error. */
Victor Stinner4e314432010-10-07 21:45:39 +0000290
291int
Victor Stinnera4a75952010-10-07 22:23:10 +0000292_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner4e314432010-10-07 21:45:39 +0000293{
294#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000295 int err;
296 struct _stat wstatbuf;
297
Victor Stinnera4a75952010-10-07 22:23:10 +0000298 err = _wstat(PyUnicode_AS_UNICODE(path), &wstatbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000299 if (!err)
300 statbuf->st_mode = wstatbuf.st_mode;
301 return err;
302#else
303 int ret;
Victor Stinnera4a75952010-10-07 22:23:10 +0000304 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000305 if (bytes == NULL)
306 return -1;
307 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
308 Py_DECREF(bytes);
309 return ret;
310#endif
311}
312
/* Open a file. Use _wfopen() on Windows, encode the path to the locale
   encoding and use fopen() otherwise.

   Return the stream on success, or NULL on error. Outside Windows, errno is
   set to EINVAL when the mode cannot be converted or does not fit (with its
   terminating nul byte) in the local buffer, or when the path cannot be
   encoded; otherwise the NULL comes straight from fopen(). */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifndef MS_WINDOWS
    FILE *f;
    char *cpath;
    char cmode[10];
    size_t r;

    r = wcstombs(cmode, mode, sizeof(cmode));
    /* r == sizeof(cmode) means the buffer was filled without a nul byte. */
    if (r == (size_t)-1 || r >= sizeof(cmode)) {
        errno = EINVAL;
        return NULL;
    }
    cpath = _Py_wchar2char(path, NULL);
    if (cpath == NULL) {
        /* Report the encoding failure the same way as _Py_wstat(),
           _Py_wreadlink() and _Py_wrealpath() do. */
        errno = EINVAL;
        return NULL;
    }
    f = fopen(cpath, cmode);
    PyMem_Free(cpath);
    return f;
#else
    return _wfopen(path, mode);
#endif
}
339
Victor Stinner6672d0c2010-10-07 22:53:43 +0000340/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
341 call fopen() otherwise.
342
343 Return the new file object on success, or NULL if the file cannot be open or
344 (if PyErr_Occurred()) on unicode error */
Victor Stinner4e314432010-10-07 21:45:39 +0000345
346FILE*
Victor Stinnera4a75952010-10-07 22:23:10 +0000347_Py_fopen(PyObject *path, const char *mode)
Victor Stinner4e314432010-10-07 21:45:39 +0000348{
349#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000350 wchar_t wmode[10];
351 int usize;
Victor Stinner4e314432010-10-07 21:45:39 +0000352
353 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
354 if (usize == 0)
355 return NULL;
356
Victor Stinnera4a75952010-10-07 22:23:10 +0000357 return _wfopen(PyUnicode_AS_UNICODE(path), wmode);
Victor Stinner4e314432010-10-07 21:45:39 +0000358#else
359 FILE *f;
Victor Stinnera4a75952010-10-07 22:23:10 +0000360 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000361 if (bytes == NULL)
362 return NULL;
363 f = fopen(PyBytes_AS_STRING(bytes), mode);
364 Py_DECREF(bytes);
365 return f;
366#endif
367}
368
#ifdef HAVE_READLINK

/* Read the value of a symbolic link. The path is encoded to the locale
   encoding before calling readlink(), and the link target is decoded from
   the locale encoding into buf (bufsiz is counted in wide characters).

   Return the length of the target in wide characters, or -1 on error.
   errno is set to EINVAL when the path cannot be encoded, when the target
   fills the whole PATH_MAX byte buffer, when it cannot be decoded, or when
   it does not fit in buf together with its null terminator. */

int
_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
{
    char target[PATH_MAX];
    char *encoded_path;
    wchar_t *decoded;
    size_t decoded_len;
    int nbytes;

    encoded_path = _Py_wchar2char(path, NULL);
    if (encoded_path == NULL)
        goto invalid;

    nbytes = (int)readlink(encoded_path, target, PATH_MAX);
    PyMem_Free(encoded_path);
    if (nbytes == -1)
        return -1;
    if (nbytes == PATH_MAX)
        goto invalid;

    /* readlink() does not terminate its output; do it here. */
    target[nbytes] = '\0';

    decoded = _Py_char2wchar(target, &decoded_len);
    if (decoded == NULL)
        goto invalid;

    if (decoded_len >= bufsiz) {
        PyMem_Free(decoded);
        goto invalid;
    }

    /* bufsiz > decoded_len, so the copy includes the null terminator. */
    wcsncpy(buf, decoded, bufsiz);
    PyMem_Free(decoded);
    return (int)decoded_len;

invalid:
    errno = EINVAL;
    return -1;
}
#endif
412
#ifdef HAVE_REALPATH

/* Return the canonicalized absolute pathname. The path is encoded to the
   locale encoding before calling realpath(), and the result is decoded from
   the locale encoding into resolved_path (resolved_path_size is counted in
   wide characters).

   Return resolved_path on success, or NULL on error. errno is set to EINVAL
   when the path cannot be encoded, when the result cannot be decoded, or
   when it does not fit in resolved_path with its null terminator. */

wchar_t*
_Py_wrealpath(const wchar_t *path,
              wchar_t *resolved_path, size_t resolved_path_size)
{
    char canonical[PATH_MAX];
    char *encoded_path;
    char *found;
    wchar_t *decoded;
    size_t decoded_len;

    encoded_path = _Py_wchar2char(path, NULL);
    if (encoded_path == NULL)
        goto invalid;

    found = realpath(encoded_path, canonical);
    PyMem_Free(encoded_path);
    if (found == NULL)
        return NULL;

    decoded = _Py_char2wchar(canonical, &decoded_len);
    if (decoded == NULL)
        goto invalid;

    if (decoded_len >= resolved_path_size) {
        PyMem_Free(decoded);
        goto invalid;
    }

    /* The destination is larger than the string, so the copy includes the
       null terminator. */
    wcsncpy(resolved_path, decoded, resolved_path_size);
    PyMem_Free(decoded);
    return resolved_path;

invalid:
    errno = EINVAL;
    return NULL;
}
#endif
452
Victor Stinnerf4061da2010-10-14 12:37:19 +0000453/* Get the current directory. size is the buffer size in wide characters
454 including the null character. Decode the path from the locale encoding. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000455
Victor Stinner4e314432010-10-07 21:45:39 +0000456wchar_t*
457_Py_wgetcwd(wchar_t *buf, size_t size)
458{
459#ifdef MS_WINDOWS
460 return _wgetcwd(buf, size);
461#else
462 char fname[PATH_MAX];
Victor Stinnerf4061da2010-10-14 12:37:19 +0000463 wchar_t *wname;
Victor Stinner168e1172010-10-16 23:16:16 +0000464 size_t len;
Victor Stinnerf4061da2010-10-14 12:37:19 +0000465
Victor Stinner4e314432010-10-07 21:45:39 +0000466 if (getcwd(fname, PATH_MAX) == NULL)
467 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000468 wname = _Py_char2wchar(fname, &len);
Victor Stinnerf4061da2010-10-14 12:37:19 +0000469 if (wname == NULL)
470 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000471 if (size <= len) {
Victor Stinnerf4061da2010-10-14 12:37:19 +0000472 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000473 return NULL;
474 }
Victor Stinnerf4061da2010-10-14 12:37:19 +0000475 wcsncpy(buf, wname, size);
476 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000477 return buf;
478#endif
479}
480
481#endif