blob: cba6696695c7aad5dfb6a7406afa08b81e60c823 [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
Victor Stinnerb306d752010-10-07 22:09:40 +00002#ifdef MS_WINDOWS
3# include <windows.h>
4#endif
Victor Stinner4e314432010-10-07 21:45:39 +00005
Victor Stinner27b1ca22012-12-03 12:47:59 +01006#ifdef __APPLE__
7extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
8#endif
9
Victor Stinner4e314432010-10-07 21:45:39 +000010#ifdef HAVE_STAT
11
/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory) and write the number of written wide
   characters excluding the null character into *size if size is not NULL.

   Return NULL on error (conversion or memory allocation error); in that case
   *size is set to (size_t)-1 if size is not NULL.

   Conversion errors should never happen, unless there is a bug in the C
   library. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
#ifdef __APPLE__
    wchar_t *wstr;
    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
    if (size != NULL) {
        if (wstr != NULL)
            *size = wcslen(wstr);
        else
            *size = (size_t)-1;
    }
    return wstr;
#else
    /* Largest number of wide characters (terminator included) whose size in
       bytes is still representable as a size_t. */
    const size_t max_chars = (size_t)-1 / sizeof(wchar_t);
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    const unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif

    if (argsize != (size_t)-1) {
        /* Reject lengths for which (argsize+1)*sizeof(wchar_t) would wrap
           around and silently under-allocate. */
        if (argsize >= max_chars)
            goto oom;
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        PyMem_Free(res);
    }

    /* Conversion failed. Fall back to escaping with surrogateescape. */

    /* Overallocate: one wide character per input byte plus the terminator.
       As multi-byte characters may be in the argument, the actual output
       could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > max_chars)
        goto oom;
    res = (wchar_t *)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (const unsigned char *)arg;
    out = res;

#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (const char *)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            PyMem_Free(res);
            if (size != NULL)
                *size = (size_t)-1;
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else   /* HAVE_MBRTOWC */
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif   /* HAVE_MBRTOWC */

    if (size != NULL)
        *size = out - res;
    return res;

oom:
    fprintf(stderr, "out of memory\n");
    if (size != NULL)
        *size = (size_t)-1;
    return NULL;
#endif   /* __APPLE__ */
}
148
/* Encode a (wide) character string to the locale encoding with the
   surrogateescape error handler (characters in range U+DC80..U+DCFF are
   converted to bytes 0x80..0xFF).

   This function is the reverse of _Py_char2wchar().

   Return a pointer to a newly allocated byte string (use PyMem_Free() to free
   the memory), or NULL on conversion or memory allocation error.

   If error_pos is not NULL: *error_pos is the index of the invalid character
   on conversion error, or (size_t)-1 otherwise (success, memory error, or a
   conversion error whose position is unknown). */
char*
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
{
#ifdef __APPLE__
    Py_ssize_t len;
    PyObject *unicode, *bytes = NULL;
    char *cpath;

    /* Establish the documented default once, so every return path below
       (including success) leaves *error_pos well defined. */
    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    unicode = PyUnicode_FromWideChar(text, wcslen(text));
    if (unicode == NULL) {
        /* Errors are reported through the NULL result only, as in the
           other failure paths of this branch. */
        PyErr_Clear();
        return NULL;
    }

    bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
    Py_DECREF(unicode);
    if (bytes == NULL) {
        PyErr_Clear();
        return NULL;
    }

    len = PyBytes_GET_SIZE(bytes);
    cpath = PyMem_Malloc(len+1);
    if (cpath == NULL) {
        PyErr_Clear();
        Py_DECREF(bytes);
        return NULL;
    }
    /* len + 1: also copy the trailing nul byte of the bytes object. */
    memcpy(cpath, PyBytes_AsString(bytes), len + 1);
    Py_DECREF(bytes);
    return cpath;
#else   /* __APPLE__ */
    const size_t len = wcslen(text);
    char *result = NULL, *bytes = NULL;
    size_t i, size, converted;
    wchar_t c, buf[2];

    /* Establish the documented default once, so every return path below
       (including success) leaves *error_pos well defined. */
    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    /* The function works in two steps:
       1. compute the length of the output buffer in bytes (size)
       2. outputs the bytes

       The same loop body serves both steps: while bytes is NULL it only
       counts; once the buffer is allocated it writes and decrements the
       remaining space in size. */
    size = 0;
    buf[1] = 0;
    while (1) {
        for (i=0; i < len; i++) {
            c = text[i];
            if (c >= 0xdc80 && c <= 0xdcff) {
                /* UTF-8b surrogate */
                if (bytes != NULL) {
                    *bytes++ = c - 0xdc00;
                    size--;
                }
                else
                    size++;
                continue;
            }
            else {
                /* Convert one character at a time (buf is a one-character,
                   nul-terminated string). */
                buf[0] = c;
                if (bytes != NULL)
                    converted = wcstombs(bytes, buf, size);
                else
                    converted = wcstombs(NULL, buf, 0);
                if (converted == (size_t)-1) {
                    /* PyMem_Free(NULL) is a no-op, so this is safe during
                       the counting pass too. */
                    PyMem_Free(result);
                    if (error_pos != NULL)
                        *error_pos = i;
                    return NULL;
                }
                if (bytes != NULL) {
                    bytes += converted;
                    size -= converted;
                }
                else
                    size += converted;
            }
        }
        if (result != NULL) {
            /* Second pass finished: terminate the string and stop. */
            *bytes = 0;
            break;
        }

        size += 1; /* nul byte at the end */
        result = PyMem_Malloc(size);
        if (result == NULL)
            return NULL;
        bytes = result;
    }
    return result;
#endif   /* __APPLE__ */
}
255
Victor Stinner4e314432010-10-07 21:45:39 +0000256/* In principle, this should use HAVE__WSTAT, and _wstat
257 should be detected by autoconf. However, no current
258 POSIX system provides that function, so testing for
259 it is pointless.
260 Not sure whether the MS_WINDOWS guards are necessary:
261 perhaps for cygwin/mingw builds?
262*/
Victor Stinnerb306d752010-10-07 22:09:40 +0000263#if defined(HAVE_STAT) && !defined(MS_WINDOWS)
Victor Stinner6672d0c2010-10-07 22:53:43 +0000264
/* Get file status for a wide character path.

   The path is encoded to the locale encoding with _Py_wchar2char() and then
   handed to stat(). Return the result of stat(), or -1 with errno set to
   EINVAL if the path cannot be encoded. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    int status;
    char *encoded = _Py_wchar2char(path, NULL);

    if (encoded != NULL) {
        status = stat(encoded, buf);
        PyMem_Free(encoded);
        return status;
    }

    /* Encoding failed: report it the same way as an invalid argument. */
    errno = EINVAL;
    return -1;
}
281#endif
282
Victor Stinner6672d0c2010-10-07 22:53:43 +0000283/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
284 call stat() otherwise. Only fill st_mode attribute on Windows.
285
286 Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
287 unicode error. */
Victor Stinner4e314432010-10-07 21:45:39 +0000288
289int
Victor Stinnera4a75952010-10-07 22:23:10 +0000290_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner4e314432010-10-07 21:45:39 +0000291{
292#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000293 int err;
294 struct _stat wstatbuf;
295
Victor Stinnera4a75952010-10-07 22:23:10 +0000296 err = _wstat(PyUnicode_AS_UNICODE(path), &wstatbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000297 if (!err)
298 statbuf->st_mode = wstatbuf.st_mode;
299 return err;
300#else
301 int ret;
Victor Stinnera4a75952010-10-07 22:23:10 +0000302 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000303 if (bytes == NULL)
304 return -1;
305 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
306 Py_DECREF(bytes);
307 return ret;
308#endif
309}
310
/* Open a file. Use _wfopen() on Windows, encode the path to the locale
   encoding and use fopen() otherwise.

   Return the stream, or NULL on error. On non-Windows platforms errno is set
   to EINVAL if the mode or the path cannot be encoded (or the mode is too
   long); otherwise errno is whatever fopen() left. */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifndef MS_WINDOWS
    FILE *f;
    char *cpath;
    char cmode[10];
    size_t r;

    r = wcstombs(cmode, mode, sizeof(cmode));
    /* r >= sizeof(cmode) means the buffer was filled without room for the
       terminating nul byte. */
    if (r == (size_t)-1 || r >= sizeof(cmode)) {
        errno = EINVAL;
        return NULL;
    }
    cpath = _Py_wchar2char(path, NULL);
    if (cpath == NULL) {
        /* Give callers a meaningful errno, as the other wide character
           wrappers in this file do on encoding failure. */
        errno = EINVAL;
        return NULL;
    }
    f = fopen(cpath, cmode);
    PyMem_Free(cpath);
    return f;
#else
    return _wfopen(path, mode);
#endif
}
337
Victor Stinner6672d0c2010-10-07 22:53:43 +0000338/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
339 call fopen() otherwise.
340
341 Return the new file object on success, or NULL if the file cannot be open or
342 (if PyErr_Occurred()) on unicode error */
Victor Stinner4e314432010-10-07 21:45:39 +0000343
344FILE*
Victor Stinnera4a75952010-10-07 22:23:10 +0000345_Py_fopen(PyObject *path, const char *mode)
Victor Stinner4e314432010-10-07 21:45:39 +0000346{
347#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000348 wchar_t wmode[10];
349 int usize;
Victor Stinner4e314432010-10-07 21:45:39 +0000350
351 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
352 if (usize == 0)
353 return NULL;
354
Victor Stinnera4a75952010-10-07 22:23:10 +0000355 return _wfopen(PyUnicode_AS_UNICODE(path), wmode);
Victor Stinner4e314432010-10-07 21:45:39 +0000356#else
357 FILE *f;
Victor Stinnera4a75952010-10-07 22:23:10 +0000358 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000359 if (bytes == NULL)
360 return NULL;
361 f = fopen(PyBytes_AS_STRING(bytes), mode);
362 Py_DECREF(bytes);
363 return f;
364#endif
365}
366
367#ifdef HAVE_READLINK
Victor Stinner6672d0c2010-10-07 22:53:43 +0000368
369/* Read value of symbolic link. Encode the path to the locale encoding, decode
370 the result from the locale encoding. */
371
Victor Stinner4e314432010-10-07 21:45:39 +0000372int
373_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
374{
375 char *cpath;
376 char cbuf[PATH_MAX];
Victor Stinner3f711f42010-10-16 22:47:37 +0000377 wchar_t *wbuf;
Victor Stinner4e314432010-10-07 21:45:39 +0000378 int res;
379 size_t r1;
380
Victor Stinner2f02a512010-11-08 22:43:46 +0000381 cpath = _Py_wchar2char(path, NULL);
Victor Stinner4e314432010-10-07 21:45:39 +0000382 if (cpath == NULL) {
383 errno = EINVAL;
384 return -1;
385 }
386 res = (int)readlink(cpath, cbuf, PATH_MAX);
387 PyMem_Free(cpath);
388 if (res == -1)
389 return -1;
390 if (res == PATH_MAX) {
391 errno = EINVAL;
392 return -1;
393 }
394 cbuf[res] = '\0'; /* buf will be null terminated */
Victor Stinner168e1172010-10-16 23:16:16 +0000395 wbuf = _Py_char2wchar(cbuf, &r1);
Victor Stinner350147b2010-10-16 22:52:09 +0000396 if (wbuf == NULL) {
397 errno = EINVAL;
398 return -1;
399 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000400 if (bufsiz <= r1) {
401 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000402 errno = EINVAL;
403 return -1;
404 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000405 wcsncpy(buf, wbuf, bufsiz);
406 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000407 return (int)r1;
408}
409#endif
410
411#ifdef HAVE_REALPATH
Victor Stinner6672d0c2010-10-07 22:53:43 +0000412
413/* Return the canonicalized absolute pathname. Encode path to the locale
414 encoding, decode the result from the locale encoding. */
415
Victor Stinner4e314432010-10-07 21:45:39 +0000416wchar_t*
Victor Stinner015f4d82010-10-07 22:29:53 +0000417_Py_wrealpath(const wchar_t *path,
418 wchar_t *resolved_path, size_t resolved_path_size)
Victor Stinner4e314432010-10-07 21:45:39 +0000419{
420 char *cpath;
421 char cresolved_path[PATH_MAX];
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000422 wchar_t *wresolved_path;
Victor Stinner4e314432010-10-07 21:45:39 +0000423 char *res;
424 size_t r;
Victor Stinner2f02a512010-11-08 22:43:46 +0000425 cpath = _Py_wchar2char(path, NULL);
Victor Stinner4e314432010-10-07 21:45:39 +0000426 if (cpath == NULL) {
427 errno = EINVAL;
428 return NULL;
429 }
430 res = realpath(cpath, cresolved_path);
431 PyMem_Free(cpath);
432 if (res == NULL)
433 return NULL;
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000434
Victor Stinner168e1172010-10-16 23:16:16 +0000435 wresolved_path = _Py_char2wchar(cresolved_path, &r);
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000436 if (wresolved_path == NULL) {
Victor Stinner4e314432010-10-07 21:45:39 +0000437 errno = EINVAL;
438 return NULL;
439 }
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000440 if (resolved_path_size <= r) {
441 PyMem_Free(wresolved_path);
442 errno = EINVAL;
443 return NULL;
444 }
445 wcsncpy(resolved_path, wresolved_path, resolved_path_size);
446 PyMem_Free(wresolved_path);
Victor Stinner4e314432010-10-07 21:45:39 +0000447 return resolved_path;
448}
449#endif
450
Victor Stinnerf4061da2010-10-14 12:37:19 +0000451/* Get the current directory. size is the buffer size in wide characters
452 including the null character. Decode the path from the locale encoding. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000453
Victor Stinner4e314432010-10-07 21:45:39 +0000454wchar_t*
455_Py_wgetcwd(wchar_t *buf, size_t size)
456{
457#ifdef MS_WINDOWS
458 return _wgetcwd(buf, size);
459#else
460 char fname[PATH_MAX];
Victor Stinnerf4061da2010-10-14 12:37:19 +0000461 wchar_t *wname;
Victor Stinner168e1172010-10-16 23:16:16 +0000462 size_t len;
Victor Stinnerf4061da2010-10-14 12:37:19 +0000463
Victor Stinner4e314432010-10-07 21:45:39 +0000464 if (getcwd(fname, PATH_MAX) == NULL)
465 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000466 wname = _Py_char2wchar(fname, &len);
Victor Stinnerf4061da2010-10-14 12:37:19 +0000467 if (wname == NULL)
468 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000469 if (size <= len) {
Victor Stinnerf4061da2010-10-14 12:37:19 +0000470 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000471 return NULL;
472 }
Victor Stinnerf4061da2010-10-14 12:37:19 +0000473 wcsncpy(buf, wname, size);
474 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000475 return buf;
476#endif
477}
478
479#endif