blob: 03fc0cb79dd3de26fda10cac7ea3c3bc2b1e9c69 [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
Victor Stinnerb306d752010-10-07 22:09:40 +00002#ifdef MS_WINDOWS
3# include <windows.h>
4#endif
Victor Stinner4e314432010-10-07 21:45:39 +00005
6#ifdef HAVE_STAT
7
/* Decode a byte string from the locale encoding with the surrogateescape
   error handler: undecodable bytes are decoded as characters in range
   U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate
   character, the bytes are escaped with the surrogateescape error handler
   instead of being decoded.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory) and, if size is not NULL, write the number
   of wide characters written, excluding the null character, into *size.
   Return NULL on error (conversion error or memory error); *size is not
   modified in that case. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif

    /* Fast path: let mbstowcs() decode the whole string in one call. */
    if (argsize != (size_t)-1) {
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        /* Either the conversion failed or a surrogate was produced:
           discard this buffer and redo the work with escaping below. */
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            /* Release the output buffer: the caller only gets NULL back
               and has no way to free it. */
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif
    /* out points at the terminating null character, so the difference is
       the number of characters written before it. */
    if (size != NULL)
        *size = out - res;
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}
127
/* Encode a wide character string to the locale encoding using the
   surrogateescape error handler: every character in range U+DC80..U+DCFF
   is emitted as the single byte 0x80..0xFF it stands for, every other
   character is encoded with wcstombs().

   This is the inverse operation of _Py_char2wchar().

   Return a newly allocated, null-terminated byte string (release it with
   PyMem_Free()), or NULL on error (conversion error or memory error). */
char*
_Py_wchar2char(const wchar_t *text)
{
    const size_t nchars = wcslen(text);
    wchar_t single[2];          /* one character + terminator for wcstombs */
    size_t total, remaining, n, pos;
    char *encoded, *cursor;

    single[1] = 0;

    /* First pass: measure how many bytes the encoded string needs. */
    total = 0;
    for (pos = 0; pos < nchars; pos++) {
        const wchar_t ch = text[pos];

        if (ch >= 0xdc80 && ch <= 0xdcff) {
            /* UTF-8b surrogate: exactly one output byte */
            total++;
            continue;
        }
        single[0] = ch;
        n = wcstombs(NULL, single, 0);
        if (n == (size_t)-1)
            return NULL;
        total += n;
    }
    total += 1; /* nul byte at the end */

    encoded = PyMem_Malloc(total);
    if (encoded == NULL)
        return NULL;

    /* Second pass: write the bytes, tracking the space that is left. */
    cursor = encoded;
    remaining = total;
    for (pos = 0; pos < nchars; pos++) {
        const wchar_t ch = text[pos];

        if (ch >= 0xdc80 && ch <= 0xdcff) {
            /* UTF-8b surrogate: restore the original byte */
            *cursor++ = ch - 0xdc00;
            remaining--;
            continue;
        }
        single[0] = ch;
        n = wcstombs(cursor, single, remaining);
        if (n == (size_t)-1) {
            PyMem_Free(encoded);
            return NULL;
        }
        cursor += n;
        remaining -= n;
    }
    *cursor = 0;
    return encoded;
}
194
/* In principle, this should use HAVE__WSTAT, and _wstat
   should be detected by autoconf. However, no current
   POSIX system provides that function, so testing for
   it is pointless.
   Not sure whether the MS_WINDOWS guards are necessary:
   perhaps for cygwin/mingw builds?
*/
Victor Stinnerb306d752010-10-07 22:09:40 +0000202#if defined(HAVE_STAT) && !defined(MS_WINDOWS)
Victor Stinner6672d0c2010-10-07 22:53:43 +0000203
/* Get the status of a file named by a wide character path. The path is
   encoded to the locale encoding with _Py_wchar2char() and passed to stat().

   Return the result of stat(), or -1 with errno set to EINVAL if the path
   cannot be encoded. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    char *encoded_path = _Py_wchar2char(path);
    int status;

    if (encoded_path != NULL) {
        status = stat(encoded_path, buf);
        PyMem_Free(encoded_path);
        return status;
    }
    /* The path could not be encoded (or memory ran out). */
    errno = EINVAL;
    return -1;
}
220#endif
221
Victor Stinner6672d0c2010-10-07 22:53:43 +0000222/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
223 call stat() otherwise. Only fill st_mode attribute on Windows.
224
225 Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
226 unicode error. */
Victor Stinner4e314432010-10-07 21:45:39 +0000227
228int
Victor Stinnera4a75952010-10-07 22:23:10 +0000229_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner4e314432010-10-07 21:45:39 +0000230{
231#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000232 int err;
233 struct _stat wstatbuf;
234
Victor Stinnera4a75952010-10-07 22:23:10 +0000235 err = _wstat(PyUnicode_AS_UNICODE(path), &wstatbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000236 if (!err)
237 statbuf->st_mode = wstatbuf.st_mode;
238 return err;
239#else
240 int ret;
Victor Stinnera4a75952010-10-07 22:23:10 +0000241 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000242 if (bytes == NULL)
243 return -1;
244 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
245 Py_DECREF(bytes);
246 return ret;
247#endif
248}
249
/* Open a file named by a wide character path. On Windows this is a direct
   call to _wfopen(); elsewhere the mode is converted with wcstombs(), the
   path is encoded to the locale encoding with _Py_wchar2char(), and fopen()
   is used.

   Return NULL on failure; errno is set to EINVAL when the mode cannot be
   converted or does not fit in the internal 10-byte buffer. */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifdef MS_WINDOWS
    return _wfopen(path, mode);
#else
    char mode_bytes[10];
    char *path_bytes;
    FILE *stream;
    size_t mode_len = wcstombs(mode_bytes, mode, 10);

    /* A length of 10 means the buffer was filled without a terminator. */
    if (mode_len == (size_t)-1 || mode_len >= 10) {
        errno = EINVAL;
        return NULL;
    }
    path_bytes = _Py_wchar2char(path);
    if (path_bytes == NULL)
        return NULL;
    stream = fopen(path_bytes, mode_bytes);
    PyMem_Free(path_bytes);
    return stream;
#endif
}
276
Victor Stinner6672d0c2010-10-07 22:53:43 +0000277/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
278 call fopen() otherwise.
279
280 Return the new file object on success, or NULL if the file cannot be open or
281 (if PyErr_Occurred()) on unicode error */
Victor Stinner4e314432010-10-07 21:45:39 +0000282
283FILE*
Victor Stinnera4a75952010-10-07 22:23:10 +0000284_Py_fopen(PyObject *path, const char *mode)
Victor Stinner4e314432010-10-07 21:45:39 +0000285{
286#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000287 wchar_t wmode[10];
288 int usize;
Victor Stinner4e314432010-10-07 21:45:39 +0000289
290 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
291 if (usize == 0)
292 return NULL;
293
Victor Stinnera4a75952010-10-07 22:23:10 +0000294 return _wfopen(PyUnicode_AS_UNICODE(path), wmode);
Victor Stinner4e314432010-10-07 21:45:39 +0000295#else
296 FILE *f;
Victor Stinnera4a75952010-10-07 22:23:10 +0000297 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000298 if (bytes == NULL)
299 return NULL;
300 f = fopen(PyBytes_AS_STRING(bytes), mode);
301 Py_DECREF(bytes);
302 return f;
303#endif
304}
305
306#ifdef HAVE_READLINK
Victor Stinner6672d0c2010-10-07 22:53:43 +0000307
308/* Read value of symbolic link. Encode the path to the locale encoding, decode
309 the result from the locale encoding. */
310
Victor Stinner4e314432010-10-07 21:45:39 +0000311int
312_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
313{
314 char *cpath;
315 char cbuf[PATH_MAX];
Victor Stinner3f711f42010-10-16 22:47:37 +0000316 wchar_t *wbuf;
Victor Stinner4e314432010-10-07 21:45:39 +0000317 int res;
318 size_t r1;
319
320 cpath = _Py_wchar2char(path);
321 if (cpath == NULL) {
322 errno = EINVAL;
323 return -1;
324 }
325 res = (int)readlink(cpath, cbuf, PATH_MAX);
326 PyMem_Free(cpath);
327 if (res == -1)
328 return -1;
329 if (res == PATH_MAX) {
330 errno = EINVAL;
331 return -1;
332 }
333 cbuf[res] = '\0'; /* buf will be null terminated */
Victor Stinner168e1172010-10-16 23:16:16 +0000334 wbuf = _Py_char2wchar(cbuf, &r1);
Victor Stinner350147b2010-10-16 22:52:09 +0000335 if (wbuf == NULL) {
336 errno = EINVAL;
337 return -1;
338 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000339 if (bufsiz <= r1) {
340 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000341 errno = EINVAL;
342 return -1;
343 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000344 wcsncpy(buf, wbuf, bufsiz);
345 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000346 return (int)r1;
347}
348#endif
349
350#ifdef HAVE_REALPATH
Victor Stinner6672d0c2010-10-07 22:53:43 +0000351
352/* Return the canonicalized absolute pathname. Encode path to the locale
353 encoding, decode the result from the locale encoding. */
354
Victor Stinner4e314432010-10-07 21:45:39 +0000355wchar_t*
Victor Stinner015f4d82010-10-07 22:29:53 +0000356_Py_wrealpath(const wchar_t *path,
357 wchar_t *resolved_path, size_t resolved_path_size)
Victor Stinner4e314432010-10-07 21:45:39 +0000358{
359 char *cpath;
360 char cresolved_path[PATH_MAX];
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000361 wchar_t *wresolved_path;
Victor Stinner4e314432010-10-07 21:45:39 +0000362 char *res;
363 size_t r;
364 cpath = _Py_wchar2char(path);
365 if (cpath == NULL) {
366 errno = EINVAL;
367 return NULL;
368 }
369 res = realpath(cpath, cresolved_path);
370 PyMem_Free(cpath);
371 if (res == NULL)
372 return NULL;
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000373
Victor Stinner168e1172010-10-16 23:16:16 +0000374 wresolved_path = _Py_char2wchar(cresolved_path, &r);
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000375 if (wresolved_path == NULL) {
Victor Stinner4e314432010-10-07 21:45:39 +0000376 errno = EINVAL;
377 return NULL;
378 }
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000379 if (resolved_path_size <= r) {
380 PyMem_Free(wresolved_path);
381 errno = EINVAL;
382 return NULL;
383 }
384 wcsncpy(resolved_path, wresolved_path, resolved_path_size);
385 PyMem_Free(wresolved_path);
Victor Stinner4e314432010-10-07 21:45:39 +0000386 return resolved_path;
387}
388#endif
389
Victor Stinnerf4061da2010-10-14 12:37:19 +0000390/* Get the current directory. size is the buffer size in wide characters
391 including the null character. Decode the path from the locale encoding. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000392
Victor Stinner4e314432010-10-07 21:45:39 +0000393wchar_t*
394_Py_wgetcwd(wchar_t *buf, size_t size)
395{
396#ifdef MS_WINDOWS
397 return _wgetcwd(buf, size);
398#else
399 char fname[PATH_MAX];
Victor Stinnerf4061da2010-10-14 12:37:19 +0000400 wchar_t *wname;
Victor Stinner168e1172010-10-16 23:16:16 +0000401 size_t len;
Victor Stinnerf4061da2010-10-14 12:37:19 +0000402
Victor Stinner4e314432010-10-07 21:45:39 +0000403 if (getcwd(fname, PATH_MAX) == NULL)
404 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000405 wname = _Py_char2wchar(fname, &len);
Victor Stinnerf4061da2010-10-14 12:37:19 +0000406 if (wname == NULL)
407 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000408 if (size <= len) {
Victor Stinnerf4061da2010-10-14 12:37:19 +0000409 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000410 return NULL;
411 }
Victor Stinnerf4061da2010-10-14 12:37:19 +0000412 wcsncpy(buf, wname, size);
413 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000414 return buf;
415#endif
416}
417
418#endif