blob: 0aad2200fb1fe64fc06ac3079b3556e456b3f889 [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
Victor Stinnerb306d752010-10-07 22:09:40 +00002#ifdef MS_WINDOWS
3# include <windows.h>
4#endif
Victor Stinner4e314432010-10-07 21:45:39 +00005
6#ifdef HAVE_STAT
7
/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory) and write the number of written wide
   characters excluding the null character into *size if size is not NULL, or
   NULL on error (decoding or memory allocation error). If size is not NULL,
   *size is set to (size_t)-1 on memory error and (size_t)-2 on decoding
   error.

   An input so long that the size of the output buffer in bytes cannot be
   represented in a size_t is reported as a memory error.

   Conversion errors should never happen, unless there is a bug in the C
   library. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
    /* Largest number of wchar_t items whose total size in bytes still fits
       in a size_t; used to reject allocations that would wrap around. */
    const size_t max_items = (size_t)-1 / sizeof(wchar_t);
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif
    if (argsize != (size_t)-1) {
        /* argsize+1 items are allocated below: refuse if that count,
           multiplied by sizeof(wchar_t), would overflow. */
        if (argsize >= max_items)
            goto oom;
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > max_items)
        goto oom;
    res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    /* argsize counts the bytes left to decode, terminating null included. */
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            PyMem_Free(res);
            if (size != NULL)
                *size = (size_t)-2;
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    argsize = strlen(arg) + 1;
    if (argsize > max_items)
        goto oom;
    res = PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif
    /* out points at the terminating null character here, so the pointer
       difference is the number of characters written before it. */
    if (size != NULL)
        *size = out - res;
    return res;
oom:
    if (size != NULL)
        *size = (size_t)-1;
    return NULL;
}
137
/* Encode a wide character string to the locale encoding using the
   surrogateescape error handler: each character in range U+DC80..U+DCFF
   becomes the single byte 0x80..0xFF, every other character is encoded
   with wcstombs().

   This is the inverse operation of _Py_char2wchar().

   The returned byte string is newly allocated and null-terminated; release
   it with PyMem_Free(). NULL is returned on encoding error or on memory
   allocation failure.

   When error_pos is not NULL, *error_pos receives the index of the
   character that could not be encoded, or (size_t)-1 if no encoding error
   occurred (including the memory allocation failure case). */
char*
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
{
    const size_t nchars = wcslen(text);
    char *encoded, *cursor;
    size_t pos, remaining, nbytes;
    wchar_t ch, one[2];

    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    /* Characters are encoded one at a time through this 1-char string. */
    one[1] = 0;

    /* First pass: add up the number of bytes the output needs. */
    remaining = 0;
    for (pos = 0; pos < nchars; pos++) {
        ch = text[pos];
        if (ch >= 0xdc80 && ch <= 0xdcff) {
            /* UTF-8b surrogate: exactly one output byte */
            remaining += 1;
            continue;
        }
        one[0] = ch;
        nbytes = wcstombs(NULL, one, 0);
        if (nbytes == (size_t)-1) {
            if (error_pos != NULL)
                *error_pos = pos;
            return NULL;
        }
        remaining += nbytes;
    }
    remaining += 1;   /* nul byte at the end */

    encoded = PyMem_Malloc(remaining);
    if (encoded == NULL)
        return NULL;

    /* Second pass: emit the bytes; 'remaining' tracks the free space left
       after 'cursor'. */
    cursor = encoded;
    for (pos = 0; pos < nchars; pos++) {
        ch = text[pos];
        if (ch >= 0xdc80 && ch <= 0xdcff) {
            /* UTF-8b surrogate: restore the original byte */
            *cursor++ = ch - 0xdc00;
            remaining--;
            continue;
        }
        one[0] = ch;
        nbytes = wcstombs(cursor, one, remaining);
        if (nbytes == (size_t)-1) {
            PyMem_Free(encoded);
            if (error_pos != NULL)
                *error_pos = pos;
            return NULL;
        }
        cursor += nbytes;
        remaining -= nbytes;
    }
    *cursor = 0;
    return encoded;
}
212
/* In principle, this should use HAVE__WSTAT, and _wstat
   should be detected by autoconf. However, no current
   POSIX system provides that function, so testing for
   it is pointless.
   Not sure whether the MS_WINDOWS guards are necessary:
   perhaps for cygwin/mingw builds?
*/
Victor Stinnerb306d752010-10-07 22:09:40 +0000220#if defined(HAVE_STAT) && !defined(MS_WINDOWS)
Victor Stinner6672d0c2010-10-07 22:53:43 +0000221
/* Get the status of the file named by the wide character string path.

   The path is encoded to the locale encoding with _Py_wchar2char() and
   handed to stat(), which fills *buf.

   Return the result of stat(), or -1 with errno set to EINVAL if the path
   cannot be encoded. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    int status;
    char *encoded = _Py_wchar2char(path, NULL);

    if (encoded != NULL) {
        status = stat(encoded, buf);
        PyMem_Free(encoded);
        return status;
    }

    /* encoding (or memory allocation) failed */
    errno = EINVAL;
    return -1;
}
238#endif
239
Victor Stinner6672d0c2010-10-07 22:53:43 +0000240/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
241 call stat() otherwise. Only fill st_mode attribute on Windows.
242
243 Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
244 unicode error. */
Victor Stinner4e314432010-10-07 21:45:39 +0000245
246int
Victor Stinnera4a75952010-10-07 22:23:10 +0000247_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner4e314432010-10-07 21:45:39 +0000248{
249#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000250 int err;
251 struct _stat wstatbuf;
Victor Stinneree587ea2011-11-17 00:51:38 +0100252 wchar_t *wpath;
Victor Stinner4e314432010-10-07 21:45:39 +0000253
Victor Stinneree587ea2011-11-17 00:51:38 +0100254 wpath = PyUnicode_AsUnicode(path);
255 if (wpath == NULL)
256 return -1;
257 err = _wstat(wpath, &wstatbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000258 if (!err)
259 statbuf->st_mode = wstatbuf.st_mode;
260 return err;
261#else
262 int ret;
Victor Stinnera4a75952010-10-07 22:23:10 +0000263 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000264 if (bytes == NULL)
265 return -1;
266 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
267 Py_DECREF(bytes);
268 return ret;
269#endif
270}
271
/* Open the file named by the wide character string path with the given
   wide character mode string.

   On Windows this is a direct call to _wfopen(). Elsewhere both strings are
   converted to byte strings (mode with wcstombs(), path with
   _Py_wchar2char(), i.e. the locale encoding) and fopen() is called.

   Return the stream, or NULL on error. errno is set to EINVAL if mode
   cannot be converted or does not fit in the internal 10 byte buffer. */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifdef MS_WINDOWS
    return _wfopen(path, mode);
#else
    char mode_bytes[10];
    char *path_bytes;
    FILE *stream;
    size_t n;

    /* A result equal to the buffer size means there was no room left for
       the terminating null byte. */
    n = wcstombs(mode_bytes, mode, sizeof(mode_bytes));
    if (n == (size_t)-1 || n >= sizeof(mode_bytes)) {
        errno = EINVAL;
        return NULL;
    }

    path_bytes = _Py_wchar2char(path, NULL);
    if (path_bytes == NULL)
        return NULL;

    stream = fopen(path_bytes, mode_bytes);
    PyMem_Free(path_bytes);
    return stream;
#endif
}
298
Victor Stinner6672d0c2010-10-07 22:53:43 +0000299/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
300 call fopen() otherwise.
301
302 Return the new file object on success, or NULL if the file cannot be open or
303 (if PyErr_Occurred()) on unicode error */
Victor Stinner4e314432010-10-07 21:45:39 +0000304
305FILE*
Victor Stinnera4a75952010-10-07 22:23:10 +0000306_Py_fopen(PyObject *path, const char *mode)
Victor Stinner4e314432010-10-07 21:45:39 +0000307{
308#ifdef MS_WINDOWS
Victor Stinneree587ea2011-11-17 00:51:38 +0100309 wchar_t *wpath;
Victor Stinner4e314432010-10-07 21:45:39 +0000310 wchar_t wmode[10];
311 int usize;
Victor Stinner4e314432010-10-07 21:45:39 +0000312
Victor Stinneree587ea2011-11-17 00:51:38 +0100313 wpath = PyUnicode_AsUnicode(path);
314 if (wpath == NULL)
315 return NULL;
316
Victor Stinner4e314432010-10-07 21:45:39 +0000317 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
318 if (usize == 0)
319 return NULL;
320
Victor Stinneree587ea2011-11-17 00:51:38 +0100321 return _wfopen(wpath, wmode);
Victor Stinner4e314432010-10-07 21:45:39 +0000322#else
323 FILE *f;
Victor Stinnera4a75952010-10-07 22:23:10 +0000324 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000325 if (bytes == NULL)
326 return NULL;
327 f = fopen(PyBytes_AS_STRING(bytes), mode);
328 Py_DECREF(bytes);
329 return f;
330#endif
331}
332
333#ifdef HAVE_READLINK
Victor Stinner6672d0c2010-10-07 22:53:43 +0000334
335/* Read value of symbolic link. Encode the path to the locale encoding, decode
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100336 the result from the locale encoding. Return -1 on error. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000337
Victor Stinner4e314432010-10-07 21:45:39 +0000338int
339_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
340{
341 char *cpath;
342 char cbuf[PATH_MAX];
Victor Stinner3f711f42010-10-16 22:47:37 +0000343 wchar_t *wbuf;
Victor Stinner4e314432010-10-07 21:45:39 +0000344 int res;
345 size_t r1;
346
Victor Stinner2f02a512010-11-08 22:43:46 +0000347 cpath = _Py_wchar2char(path, NULL);
Victor Stinner4e314432010-10-07 21:45:39 +0000348 if (cpath == NULL) {
349 errno = EINVAL;
350 return -1;
351 }
352 res = (int)readlink(cpath, cbuf, PATH_MAX);
353 PyMem_Free(cpath);
354 if (res == -1)
355 return -1;
356 if (res == PATH_MAX) {
357 errno = EINVAL;
358 return -1;
359 }
360 cbuf[res] = '\0'; /* buf will be null terminated */
Victor Stinner168e1172010-10-16 23:16:16 +0000361 wbuf = _Py_char2wchar(cbuf, &r1);
Victor Stinner350147b2010-10-16 22:52:09 +0000362 if (wbuf == NULL) {
363 errno = EINVAL;
364 return -1;
365 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000366 if (bufsiz <= r1) {
367 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000368 errno = EINVAL;
369 return -1;
370 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000371 wcsncpy(buf, wbuf, bufsiz);
372 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000373 return (int)r1;
374}
375#endif
376
377#ifdef HAVE_REALPATH
Victor Stinner6672d0c2010-10-07 22:53:43 +0000378
379/* Return the canonicalized absolute pathname. Encode path to the locale
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100380 encoding, decode the result from the locale encoding.
381 Return NULL on error. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000382
Victor Stinner4e314432010-10-07 21:45:39 +0000383wchar_t*
Victor Stinner015f4d82010-10-07 22:29:53 +0000384_Py_wrealpath(const wchar_t *path,
385 wchar_t *resolved_path, size_t resolved_path_size)
Victor Stinner4e314432010-10-07 21:45:39 +0000386{
387 char *cpath;
388 char cresolved_path[PATH_MAX];
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000389 wchar_t *wresolved_path;
Victor Stinner4e314432010-10-07 21:45:39 +0000390 char *res;
391 size_t r;
Victor Stinner2f02a512010-11-08 22:43:46 +0000392 cpath = _Py_wchar2char(path, NULL);
Victor Stinner4e314432010-10-07 21:45:39 +0000393 if (cpath == NULL) {
394 errno = EINVAL;
395 return NULL;
396 }
397 res = realpath(cpath, cresolved_path);
398 PyMem_Free(cpath);
399 if (res == NULL)
400 return NULL;
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000401
Victor Stinner168e1172010-10-16 23:16:16 +0000402 wresolved_path = _Py_char2wchar(cresolved_path, &r);
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000403 if (wresolved_path == NULL) {
Victor Stinner4e314432010-10-07 21:45:39 +0000404 errno = EINVAL;
405 return NULL;
406 }
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000407 if (resolved_path_size <= r) {
408 PyMem_Free(wresolved_path);
409 errno = EINVAL;
410 return NULL;
411 }
412 wcsncpy(resolved_path, wresolved_path, resolved_path_size);
413 PyMem_Free(wresolved_path);
Victor Stinner4e314432010-10-07 21:45:39 +0000414 return resolved_path;
415}
416#endif
417
Victor Stinnerf4061da2010-10-14 12:37:19 +0000418/* Get the current directory. size is the buffer size in wide characters
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100419 including the null character. Decode the path from the locale encoding.
420 Return NULL on error. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000421
Victor Stinner4e314432010-10-07 21:45:39 +0000422wchar_t*
423_Py_wgetcwd(wchar_t *buf, size_t size)
424{
425#ifdef MS_WINDOWS
426 return _wgetcwd(buf, size);
427#else
428 char fname[PATH_MAX];
Victor Stinnerf4061da2010-10-14 12:37:19 +0000429 wchar_t *wname;
Victor Stinner168e1172010-10-16 23:16:16 +0000430 size_t len;
Victor Stinnerf4061da2010-10-14 12:37:19 +0000431
Victor Stinner4e314432010-10-07 21:45:39 +0000432 if (getcwd(fname, PATH_MAX) == NULL)
433 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000434 wname = _Py_char2wchar(fname, &len);
Victor Stinnerf4061da2010-10-14 12:37:19 +0000435 if (wname == NULL)
436 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000437 if (size <= len) {
Victor Stinnerf4061da2010-10-14 12:37:19 +0000438 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000439 return NULL;
440 }
Victor Stinnerf4061da2010-10-14 12:37:19 +0000441 wcsncpy(buf, wname, size);
442 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000443 return buf;
444#endif
445}
446
447#endif