blob: 0afa415d59b3f5cc3f540eb10e990a9ea2d224f2 [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
Victor Stinnerb306d752010-10-07 22:09:40 +00002#ifdef MS_WINDOWS
3# include <windows.h>
4#endif
Victor Stinner4e314432010-10-07 21:45:39 +00005
6#ifdef HAVE_STAT
7
/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory) and write the number of written wide
   characters excluding the null character into *size if size is not NULL, or
   NULL on error (conversion or memory allocation error). A request whose
   byte size would not fit in a size_t is reported as a memory allocation
   error.

   Conversion errors should never happen, unless there is a bug in the C
   library. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
    /* Largest number of wchar_t elements whose total byte size still fits
       in a size_t; anything larger would wrap around in the multiplication
       handed to PyMem_Malloc() and yield an undersized buffer. */
    const size_t max_wchars = (size_t)-1 / sizeof(wchar_t);
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif
    if (argsize != (size_t)-1) {
        /* argsize+1 elements are needed: the text plus the null character. */
        if (argsize >= max_wchars)
            goto oom;
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > max_wchars)
        goto oom;
    res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    argsize = strlen(arg) + 1;
    if (argsize > max_wchars)
        goto oom;
    res = PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif
    if (size != NULL)
        *size = out - res;
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}
132
/* Encode a wide character string to the locale encoding, applying the
   surrogateescape error handler: each character in U+DC80..U+DCFF is
   written back as the single byte 0x80..0xFF it stands for.

   This is the inverse of _Py_char2wchar().

   The result is a newly allocated, null-terminated byte string that the
   caller releases with PyMem_Free(). NULL is returned when a character
   cannot be encoded or when memory allocation fails.

   When error_pos is not NULL, *error_pos receives the index of the
   offending character on a conversion error and (size_t)-1 in every
   other case. */
char*
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
{
    const size_t nchars = wcslen(text);
    char *encoded = NULL;   /* output buffer; NULL while measuring */
    char *cursor = NULL;    /* write position; NULL while measuring */
    size_t room = 0;        /* pass 1: bytes needed; pass 2: bytes left */
    size_t pos, nbytes;
    wchar_t single[2];
    wchar_t ch;

    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    /* Characters are converted one at a time through a one-character,
       null-terminated scratch string. The same loop runs twice: first to
       measure the output (cursor == NULL), then to fill it. */
    single[1] = 0;
    for (;;) {
        for (pos = 0; pos < nchars; pos++) {
            ch = text[pos];
            if (ch >= 0xdc80 && ch <= 0xdcff) {
                /* Escaped byte: exactly one byte of output. */
                if (cursor == NULL) {
                    room++;
                }
                else {
                    *cursor++ = ch - 0xdc00;
                    room--;
                }
                continue;
            }

            single[0] = ch;
            if (cursor == NULL)
                nbytes = wcstombs(NULL, single, 0);
            else
                nbytes = wcstombs(cursor, single, room);
            if (nbytes == (size_t)-1) {
                if (encoded != NULL)
                    PyMem_Free(encoded);
                if (error_pos != NULL)
                    *error_pos = pos;
                return NULL;
            }

            if (cursor == NULL) {
                room += nbytes;
            }
            else {
                cursor += nbytes;
                room -= nbytes;
            }
        }

        if (encoded != NULL) {
            /* Second pass finished: terminate the string. */
            *cursor = 0;
            break;
        }

        /* First pass finished: allocate, counting the trailing nul byte. */
        room += 1;
        encoded = PyMem_Malloc(room);
        if (encoded == NULL)
            return NULL;
        cursor = encoded;
    }
    return encoded;
}
207
/* In principle, this should use HAVE__WSTAT, and _wstat
   should be detected by autoconf. However, no current
   POSIX system provides that function, so testing for
   it is pointless.
   Not sure whether the MS_WINDOWS guards are necessary:
   perhaps for cygwin/mingw builds?
*/
Victor Stinnerb306d752010-10-07 22:09:40 +0000215#if defined(HAVE_STAT) && !defined(MS_WINDOWS)
Victor Stinner6672d0c2010-10-07 22:53:43 +0000216
/* stat() for a wide character path. The path is encoded with
   _Py_wchar2char() before being handed to stat().

   Return the result of stat(), or -1 with errno set to EINVAL when the
   path cannot be encoded. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    int status;
    char *encoded = _Py_wchar2char(path, NULL);

    if (encoded == NULL) {
        errno = EINVAL;
        return -1;
    }
    status = stat(encoded, buf);
    PyMem_Free(encoded);
    return status;
}
233#endif
234
Victor Stinner6672d0c2010-10-07 22:53:43 +0000235/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
236 call stat() otherwise. Only fill st_mode attribute on Windows.
237
238 Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
239 unicode error. */
Victor Stinner4e314432010-10-07 21:45:39 +0000240
241int
Victor Stinnera4a75952010-10-07 22:23:10 +0000242_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner4e314432010-10-07 21:45:39 +0000243{
244#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000245 int err;
246 struct _stat wstatbuf;
Victor Stinneree587ea2011-11-17 00:51:38 +0100247 wchar_t *wpath;
Victor Stinner4e314432010-10-07 21:45:39 +0000248
Victor Stinneree587ea2011-11-17 00:51:38 +0100249 wpath = PyUnicode_AsUnicode(path);
250 if (wpath == NULL)
251 return -1;
252 err = _wstat(wpath, &wstatbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000253 if (!err)
254 statbuf->st_mode = wstatbuf.st_mode;
255 return err;
256#else
257 int ret;
Victor Stinnera4a75952010-10-07 22:23:10 +0000258 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000259 if (bytes == NULL)
260 return -1;
261 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
262 Py_DECREF(bytes);
263 return ret;
264#endif
265}
266
/* Open a file. Use _wfopen() on Windows, encode the path to the locale
   encoding and use fopen() otherwise.

   Return the stream on success, or NULL on error. Outside Windows, errno is
   set to EINVAL when the mode does not fit the internal buffer or when
   either the mode or the path cannot be encoded; otherwise NULL is whatever
   fopen() reported. */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifndef MS_WINDOWS
    FILE *f;
    char *cpath;
    char cmode[10];
    size_t r;

    /* A result equal to the buffer size means wcstombs() filled the buffer
       without writing the terminating nul byte, so treat it as an error. */
    r = wcstombs(cmode, mode, sizeof(cmode));
    if (r == (size_t)-1 || r >= sizeof(cmode)) {
        errno = EINVAL;
        return NULL;
    }
    cpath = _Py_wchar2char(path, NULL);
    if (cpath == NULL) {
        /* Report the failure the same way as the mode conversion above. */
        errno = EINVAL;
        return NULL;
    }
    f = fopen(cpath, cmode);
    PyMem_Free(cpath);
    return f;
#else
    return _wfopen(path, mode);
#endif
}
293
Victor Stinner6672d0c2010-10-07 22:53:43 +0000294/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
295 call fopen() otherwise.
296
297 Return the new file object on success, or NULL if the file cannot be open or
298 (if PyErr_Occurred()) on unicode error */
Victor Stinner4e314432010-10-07 21:45:39 +0000299
300FILE*
Victor Stinnera4a75952010-10-07 22:23:10 +0000301_Py_fopen(PyObject *path, const char *mode)
Victor Stinner4e314432010-10-07 21:45:39 +0000302{
303#ifdef MS_WINDOWS
Victor Stinneree587ea2011-11-17 00:51:38 +0100304 wchar_t *wpath;
Victor Stinner4e314432010-10-07 21:45:39 +0000305 wchar_t wmode[10];
306 int usize;
Victor Stinner4e314432010-10-07 21:45:39 +0000307
Victor Stinneree587ea2011-11-17 00:51:38 +0100308 wpath = PyUnicode_AsUnicode(path);
309 if (wpath == NULL)
310 return NULL;
311
Victor Stinner4e314432010-10-07 21:45:39 +0000312 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
313 if (usize == 0)
314 return NULL;
315
Victor Stinneree587ea2011-11-17 00:51:38 +0100316 return _wfopen(wpath, wmode);
Victor Stinner4e314432010-10-07 21:45:39 +0000317#else
318 FILE *f;
Victor Stinnera4a75952010-10-07 22:23:10 +0000319 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000320 if (bytes == NULL)
321 return NULL;
322 f = fopen(PyBytes_AS_STRING(bytes), mode);
323 Py_DECREF(bytes);
324 return f;
325#endif
326}
327
328#ifdef HAVE_READLINK
Victor Stinner6672d0c2010-10-07 22:53:43 +0000329
330/* Read value of symbolic link. Encode the path to the locale encoding, decode
331 the result from the locale encoding. */
332
Victor Stinner4e314432010-10-07 21:45:39 +0000333int
334_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
335{
336 char *cpath;
337 char cbuf[PATH_MAX];
Victor Stinner3f711f42010-10-16 22:47:37 +0000338 wchar_t *wbuf;
Victor Stinner4e314432010-10-07 21:45:39 +0000339 int res;
340 size_t r1;
341
Victor Stinner2f02a512010-11-08 22:43:46 +0000342 cpath = _Py_wchar2char(path, NULL);
Victor Stinner4e314432010-10-07 21:45:39 +0000343 if (cpath == NULL) {
344 errno = EINVAL;
345 return -1;
346 }
347 res = (int)readlink(cpath, cbuf, PATH_MAX);
348 PyMem_Free(cpath);
349 if (res == -1)
350 return -1;
351 if (res == PATH_MAX) {
352 errno = EINVAL;
353 return -1;
354 }
355 cbuf[res] = '\0'; /* buf will be null terminated */
Victor Stinner168e1172010-10-16 23:16:16 +0000356 wbuf = _Py_char2wchar(cbuf, &r1);
Victor Stinner350147b2010-10-16 22:52:09 +0000357 if (wbuf == NULL) {
358 errno = EINVAL;
359 return -1;
360 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000361 if (bufsiz <= r1) {
362 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000363 errno = EINVAL;
364 return -1;
365 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000366 wcsncpy(buf, wbuf, bufsiz);
367 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000368 return (int)r1;
369}
370#endif
371
372#ifdef HAVE_REALPATH
Victor Stinner6672d0c2010-10-07 22:53:43 +0000373
374/* Return the canonicalized absolute pathname. Encode path to the locale
375 encoding, decode the result from the locale encoding. */
376
Victor Stinner4e314432010-10-07 21:45:39 +0000377wchar_t*
Victor Stinner015f4d82010-10-07 22:29:53 +0000378_Py_wrealpath(const wchar_t *path,
379 wchar_t *resolved_path, size_t resolved_path_size)
Victor Stinner4e314432010-10-07 21:45:39 +0000380{
381 char *cpath;
382 char cresolved_path[PATH_MAX];
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000383 wchar_t *wresolved_path;
Victor Stinner4e314432010-10-07 21:45:39 +0000384 char *res;
385 size_t r;
Victor Stinner2f02a512010-11-08 22:43:46 +0000386 cpath = _Py_wchar2char(path, NULL);
Victor Stinner4e314432010-10-07 21:45:39 +0000387 if (cpath == NULL) {
388 errno = EINVAL;
389 return NULL;
390 }
391 res = realpath(cpath, cresolved_path);
392 PyMem_Free(cpath);
393 if (res == NULL)
394 return NULL;
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000395
Victor Stinner168e1172010-10-16 23:16:16 +0000396 wresolved_path = _Py_char2wchar(cresolved_path, &r);
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000397 if (wresolved_path == NULL) {
Victor Stinner4e314432010-10-07 21:45:39 +0000398 errno = EINVAL;
399 return NULL;
400 }
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000401 if (resolved_path_size <= r) {
402 PyMem_Free(wresolved_path);
403 errno = EINVAL;
404 return NULL;
405 }
406 wcsncpy(resolved_path, wresolved_path, resolved_path_size);
407 PyMem_Free(wresolved_path);
Victor Stinner4e314432010-10-07 21:45:39 +0000408 return resolved_path;
409}
410#endif
411
Victor Stinnerf4061da2010-10-14 12:37:19 +0000412/* Get the current directory. size is the buffer size in wide characters
413 including the null character. Decode the path from the locale encoding. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000414
Victor Stinner4e314432010-10-07 21:45:39 +0000415wchar_t*
416_Py_wgetcwd(wchar_t *buf, size_t size)
417{
418#ifdef MS_WINDOWS
419 return _wgetcwd(buf, size);
420#else
421 char fname[PATH_MAX];
Victor Stinnerf4061da2010-10-14 12:37:19 +0000422 wchar_t *wname;
Victor Stinner168e1172010-10-16 23:16:16 +0000423 size_t len;
Victor Stinnerf4061da2010-10-14 12:37:19 +0000424
Victor Stinner4e314432010-10-07 21:45:39 +0000425 if (getcwd(fname, PATH_MAX) == NULL)
426 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000427 wname = _Py_char2wchar(fname, &len);
Victor Stinnerf4061da2010-10-14 12:37:19 +0000428 if (wname == NULL)
429 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000430 if (size <= len) {
Victor Stinnerf4061da2010-10-14 12:37:19 +0000431 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000432 return NULL;
433 }
Victor Stinnerf4061da2010-10-14 12:37:19 +0000434 wcsncpy(buf, wname, size);
435 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000436 return buf;
437#endif
438}
439
440#endif