blob: cfafd865c54d222906b9092b0f4db84322954fd6 [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
Victor Stinnerb306d752010-10-07 22:09:40 +00002#ifdef MS_WINDOWS
3# include <windows.h>
4#endif
Victor Stinner4e314432010-10-07 21:45:39 +00005
6#ifdef HAVE_STAT
7
/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated (wide) character string (use
   PyMem_Free() to free the memory), or NULL on error (conversion error or
   memory error). A message is written to stderr on memory error and on an
   unexpected mbrtowc() result. */
wchar_t*
_Py_char2wchar(const char* arg)
{
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif
    if (argsize != (size_t)-1) {
        /* Refuse sizes for which (argsize+1)*sizeof(wchar_t) would wrap. */
        if (argsize >= (size_t)-1 / sizeof(wchar_t))
            goto oom;
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0)
                return res;
        }
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > (size_t)-1 / sizeof(wchar_t))
        goto oom;
    res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            /* Release the partially filled buffer before bailing out. */
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    argsize = strlen(arg) + 1;
    if (argsize > (size_t)-1 / sizeof(wchar_t))
        goto oom;
    res = PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}
121
/* Encode a (wide) character string to the locale encoding with the
   surrogateescape error handler (characters in range U+DC80..U+DCFF are
   converted to bytes 0x80..0xFF).

   This function is the reverse of _Py_char2wchar().

   Return a pointer to a newly allocated byte string (use PyMem_Free() to free
   the memory), or NULL on error (conversion error or memory error). */
char*
_Py_wchar2char(const wchar_t *text)
{
    const size_t nchars = wcslen(text);
    char *result = NULL;
    char *cursor = NULL;
    size_t remaining = 0;
    size_t idx, nbytes;
    wchar_t one[2];
    int writing;

    /* Each character is converted on its own through a two-element,
       nul-terminated scratch string. */
    one[1] = 0;

    /* Two passes over the text: the first (writing == 0) only measures the
       number of output bytes, the second (writing == 1) fills the buffer
       allocated between the passes. */
    for (writing = 0; ; writing = 1) {
        for (idx = 0; idx < nchars; idx++) {
            const wchar_t ch = text[idx];

            if (ch >= 0xdc80 && ch <= 0xdcff) {
                /* UTF-8b surrogate: maps back to exactly one byte */
                if (writing) {
                    *cursor++ = ch - 0xdc00;
                    remaining--;
                }
                else {
                    remaining++;
                }
                continue;
            }

            one[0] = ch;
            if (writing)
                nbytes = wcstombs(cursor, one, remaining);
            else
                nbytes = wcstombs(NULL, one, 0);
            if (nbytes == (size_t)-1) {
                if (result != NULL)
                    PyMem_Free(result);
                return NULL;
            }

            if (writing) {
                cursor += nbytes;
                remaining -= nbytes;
            }
            else {
                remaining += nbytes;
            }
        }

        if (writing) {
            *cursor = 0;
            break;
        }

        remaining += 1; /* nul byte at the end */
        result = PyMem_Malloc(remaining);
        if (result == NULL)
            return NULL;
        cursor = result;
    }
    return result;
}
188
/* In principle, this should use HAVE__WSTAT, and _wstat
   should be detected by autoconf. However, no current
   POSIX system provides that function, so testing for
   it is pointless.
   Not sure whether the MS_WINDOWS guards are necessary:
   perhaps for cygwin/mingw builds?
*/
Victor Stinnerb306d752010-10-07 22:09:40 +0000196#if defined(HAVE_STAT) && !defined(MS_WINDOWS)
Victor Stinner6672d0c2010-10-07 22:53:43 +0000197
/* Get file status. The wide-character path is first encoded with
   _Py_wchar2char() and then handed to stat().

   Return the stat() result, or -1 with errno set to EINVAL if the path
   cannot be encoded. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    int status;
    char *encoded = _Py_wchar2char(path);

    if (encoded != NULL) {
        status = stat(encoded, buf);
        PyMem_Free(encoded);
        return status;
    }
    errno = EINVAL;
    return -1;
}
214#endif
215
Victor Stinner6672d0c2010-10-07 22:53:43 +0000216/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
217 call stat() otherwise. Only fill st_mode attribute on Windows.
218
219 Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
220 unicode error. */
Victor Stinner4e314432010-10-07 21:45:39 +0000221
222int
Victor Stinnera4a75952010-10-07 22:23:10 +0000223_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner4e314432010-10-07 21:45:39 +0000224{
225#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000226 int err;
227 struct _stat wstatbuf;
228
Victor Stinnera4a75952010-10-07 22:23:10 +0000229 err = _wstat(PyUnicode_AS_UNICODE(path), &wstatbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000230 if (!err)
231 statbuf->st_mode = wstatbuf.st_mode;
232 return err;
233#else
234 int ret;
Victor Stinnera4a75952010-10-07 22:23:10 +0000235 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000236 if (bytes == NULL)
237 return -1;
238 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
239 Py_DECREF(bytes);
240 return ret;
241#endif
242}
243
/* Open a file. Use _wfopen() on Windows; otherwise convert the mode with
   wcstombs(), encode the path with _Py_wchar2char() and use fopen().

   Return NULL (with errno set to EINVAL) if the mode cannot be converted or
   does not fit in the internal 10-byte buffer, and NULL if the path cannot
   be encoded. */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifdef MS_WINDOWS
    return _wfopen(path, mode);
#else
    char mode_bytes[10];
    char *path_bytes;
    FILE *stream;
    size_t mode_len;

    mode_len = wcstombs(mode_bytes, mode, sizeof(mode_bytes));
    /* A result equal to the buffer size means no room was left for the
       terminating nul byte. */
    if (mode_len == (size_t)-1 || mode_len >= sizeof(mode_bytes)) {
        errno = EINVAL;
        return NULL;
    }

    path_bytes = _Py_wchar2char(path);
    if (path_bytes == NULL)
        return NULL;

    stream = fopen(path_bytes, mode_bytes);
    PyMem_Free(path_bytes);
    return stream;
#endif
}
270
Victor Stinner6672d0c2010-10-07 22:53:43 +0000271/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
272 call fopen() otherwise.
273
274 Return the new file object on success, or NULL if the file cannot be open or
275 (if PyErr_Occurred()) on unicode error */
Victor Stinner4e314432010-10-07 21:45:39 +0000276
277FILE*
Victor Stinnera4a75952010-10-07 22:23:10 +0000278_Py_fopen(PyObject *path, const char *mode)
Victor Stinner4e314432010-10-07 21:45:39 +0000279{
280#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000281 wchar_t wmode[10];
282 int usize;
Victor Stinner4e314432010-10-07 21:45:39 +0000283
284 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
285 if (usize == 0)
286 return NULL;
287
Victor Stinnera4a75952010-10-07 22:23:10 +0000288 return _wfopen(PyUnicode_AS_UNICODE(path), wmode);
Victor Stinner4e314432010-10-07 21:45:39 +0000289#else
290 FILE *f;
Victor Stinnera4a75952010-10-07 22:23:10 +0000291 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000292 if (bytes == NULL)
293 return NULL;
294 f = fopen(PyBytes_AS_STRING(bytes), mode);
295 Py_DECREF(bytes);
296 return f;
297#endif
298}
299
300#ifdef HAVE_READLINK
Victor Stinner6672d0c2010-10-07 22:53:43 +0000301
302/* Read value of symbolic link. Encode the path to the locale encoding, decode
303 the result from the locale encoding. */
304
Victor Stinner4e314432010-10-07 21:45:39 +0000305int
306_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
307{
308 char *cpath;
309 char cbuf[PATH_MAX];
Victor Stinner3f711f42010-10-16 22:47:37 +0000310 wchar_t *wbuf;
Victor Stinner4e314432010-10-07 21:45:39 +0000311 int res;
312 size_t r1;
313
314 cpath = _Py_wchar2char(path);
315 if (cpath == NULL) {
316 errno = EINVAL;
317 return -1;
318 }
319 res = (int)readlink(cpath, cbuf, PATH_MAX);
320 PyMem_Free(cpath);
321 if (res == -1)
322 return -1;
323 if (res == PATH_MAX) {
324 errno = EINVAL;
325 return -1;
326 }
327 cbuf[res] = '\0'; /* buf will be null terminated */
Victor Stinner3f711f42010-10-16 22:47:37 +0000328 wbuf = _Py_char2wchar(cbuf);
329 r1 = wcslen(wbuf);
330 if (bufsiz <= r1) {
331 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000332 errno = EINVAL;
333 return -1;
334 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000335 wcsncpy(buf, wbuf, bufsiz);
336 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000337 return (int)r1;
338}
339#endif
340
341#ifdef HAVE_REALPATH
Victor Stinner6672d0c2010-10-07 22:53:43 +0000342
343/* Return the canonicalized absolute pathname. Encode path to the locale
344 encoding, decode the result from the locale encoding. */
345
Victor Stinner4e314432010-10-07 21:45:39 +0000346wchar_t*
Victor Stinner015f4d82010-10-07 22:29:53 +0000347_Py_wrealpath(const wchar_t *path,
348 wchar_t *resolved_path, size_t resolved_path_size)
Victor Stinner4e314432010-10-07 21:45:39 +0000349{
350 char *cpath;
351 char cresolved_path[PATH_MAX];
352 char *res;
353 size_t r;
354 cpath = _Py_wchar2char(path);
355 if (cpath == NULL) {
356 errno = EINVAL;
357 return NULL;
358 }
359 res = realpath(cpath, cresolved_path);
360 PyMem_Free(cpath);
361 if (res == NULL)
362 return NULL;
Victor Stinner015f4d82010-10-07 22:29:53 +0000363 r = mbstowcs(resolved_path, cresolved_path, resolved_path_size);
Victor Stinner4e314432010-10-07 21:45:39 +0000364 if (r == (size_t)-1 || r >= PATH_MAX) {
365 errno = EINVAL;
366 return NULL;
367 }
368 return resolved_path;
369}
370#endif
371
Victor Stinnerf4061da2010-10-14 12:37:19 +0000372/* Get the current directory. size is the buffer size in wide characters
373 including the null character. Decode the path from the locale encoding. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000374
Victor Stinner4e314432010-10-07 21:45:39 +0000375wchar_t*
376_Py_wgetcwd(wchar_t *buf, size_t size)
377{
378#ifdef MS_WINDOWS
379 return _wgetcwd(buf, size);
380#else
381 char fname[PATH_MAX];
Victor Stinnerf4061da2010-10-14 12:37:19 +0000382 wchar_t *wname;
383
Victor Stinner4e314432010-10-07 21:45:39 +0000384 if (getcwd(fname, PATH_MAX) == NULL)
385 return NULL;
Victor Stinnerf4061da2010-10-14 12:37:19 +0000386 wname = _Py_char2wchar(fname);
387 if (wname == NULL)
388 return NULL;
389 if (size <= wcslen(wname)) {
390 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000391 return NULL;
392 }
Victor Stinnerf4061da2010-10-14 12:37:19 +0000393 wcsncpy(buf, wname, size);
394 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000395 return buf;
396#endif
397}
398
399#endif