blob: 8993c8c49742bbdd9953ab7e73be607a6e74d2ae [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
Victor Stinnerb306d752010-10-07 22:09:40 +00002#ifdef MS_WINDOWS
3# include <windows.h>
4#endif
Victor Stinner4e314432010-10-07 21:45:39 +00005
6#ifdef HAVE_STAT
7
8/* Decode a byte string from the locale encoding with the
9 surrogateescape error handler (undecodable bytes are decoded as characters
10 in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
11 character, escape the bytes using the surrogateescape error handler instead
12 of decoding them.
13
14 Use _Py_wchar2char() to encode the character string back to a byte string.
15
Victor Stinner168e1172010-10-16 23:16:16 +000016 Return a pointer to a newly allocated wide character string (use
17 PyMem_Free() to free the memory) and write the number of written wide
18 characters excluding the null character into *size if size is not NULL, or
Victor Stinneraf02e1c2011-12-16 23:56:01 +010019 NULL on error (decoding or memory allocation error). If size is not NULL,
20 *size is set to (size_t)-1 on memory error and (size_t)-2 on decoding
21 error.
Victor Stinner19de4c32010-11-08 23:30:46 +000022
23 Conversion errors should never happen, unless there is a bug in the C
24 library. */
Victor Stinner4e314432010-10-07 21:45:39 +000025wchar_t*
Victor Stinner168e1172010-10-16 23:16:16 +000026_Py_char2wchar(const char* arg, size_t *size)
Victor Stinner4e314432010-10-07 21:45:39 +000027{
28 wchar_t *res;
29#ifdef HAVE_BROKEN_MBSTOWCS
30 /* Some platforms have a broken implementation of
31 * mbstowcs which does not count the characters that
32 * would result from conversion. Use an upper bound.
33 */
34 size_t argsize = strlen(arg);
35#else
36 size_t argsize = mbstowcs(NULL, arg, 0);
37#endif
38 size_t count;
39 unsigned char *in;
40 wchar_t *out;
41#ifdef HAVE_MBRTOWC
42 mbstate_t mbs;
43#endif
44 if (argsize != (size_t)-1) {
45 res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
46 if (!res)
47 goto oom;
48 count = mbstowcs(res, arg, argsize+1);
49 if (count != (size_t)-1) {
50 wchar_t *tmp;
51 /* Only use the result if it contains no
52 surrogate characters. */
53 for (tmp = res; *tmp != 0 &&
54 (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
55 ;
Victor Stinner168e1172010-10-16 23:16:16 +000056 if (*tmp == 0) {
57 if (size != NULL)
58 *size = count;
Victor Stinner4e314432010-10-07 21:45:39 +000059 return res;
Victor Stinner168e1172010-10-16 23:16:16 +000060 }
Victor Stinner4e314432010-10-07 21:45:39 +000061 }
62 PyMem_Free(res);
63 }
64 /* Conversion failed. Fall back to escaping with surrogateescape. */
65#ifdef HAVE_MBRTOWC
66 /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */
67
68 /* Overallocate; as multi-byte characters are in the argument, the
69 actual output could use less memory. */
70 argsize = strlen(arg) + 1;
71 res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
Victor Stinner19de4c32010-11-08 23:30:46 +000072 if (!res)
73 goto oom;
Victor Stinner4e314432010-10-07 21:45:39 +000074 in = (unsigned char*)arg;
75 out = res;
76 memset(&mbs, 0, sizeof mbs);
77 while (argsize) {
78 size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
79 if (converted == 0)
80 /* Reached end of string; null char stored. */
81 break;
82 if (converted == (size_t)-2) {
83 /* Incomplete character. This should never happen,
84 since we provide everything that we have -
85 unless there is a bug in the C library, or I
86 misunderstood how mbrtowc works. */
Victor Stinner19de4c32010-11-08 23:30:46 +000087 PyMem_Free(res);
Victor Stinneraf02e1c2011-12-16 23:56:01 +010088 if (size != NULL)
89 *size = (size_t)-2;
Victor Stinner4e314432010-10-07 21:45:39 +000090 return NULL;
91 }
92 if (converted == (size_t)-1) {
93 /* Conversion error. Escape as UTF-8b, and start over
94 in the initial shift state. */
95 *out++ = 0xdc00 + *in++;
96 argsize--;
97 memset(&mbs, 0, sizeof mbs);
98 continue;
99 }
100 if (*out >= 0xd800 && *out <= 0xdfff) {
101 /* Surrogate character. Escape the original
102 byte sequence with surrogateescape. */
103 argsize -= converted;
104 while (converted--)
105 *out++ = 0xdc00 + *in++;
106 continue;
107 }
108 /* successfully converted some bytes */
109 in += converted;
110 argsize -= converted;
111 out++;
112 }
113#else
114 /* Cannot use C locale for escaping; manually escape as if charset
115 is ASCII (i.e. escape all bytes > 128. This will still roundtrip
116 correctly in the locale's charset, which must be an ASCII superset. */
117 res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100118 if (!res)
119 goto oom;
Victor Stinner4e314432010-10-07 21:45:39 +0000120 in = (unsigned char*)arg;
121 out = res;
122 while(*in)
123 if(*in < 128)
124 *out++ = *in++;
125 else
126 *out++ = 0xdc00 + *in++;
127 *out = 0;
128#endif
Victor Stinner168e1172010-10-16 23:16:16 +0000129 if (size != NULL)
130 *size = out - res;
Victor Stinner4e314432010-10-07 21:45:39 +0000131 return res;
132oom:
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100133 if (size != NULL)
134 *size = (size_t)-1;
Victor Stinner4e314432010-10-07 21:45:39 +0000135 return NULL;
136}
137
/* Encode a wide character string to the locale encoding, applying the
   surrogateescape error handler: every character in U+DC80..U+DCFF is
   written as the single byte 0x80..0xFF it stands for.

   This function is the reverse of _Py_char2wchar().

   Return a newly allocated, null-terminated byte string (release it with
   PyMem_Free()), or NULL on encoding or memory allocation error.

   If error_pos is not NULL, *error_pos receives the index of the invalid
   character on encoding error, and (size_t)-1 in every other case. */
char*
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
{
    const size_t nchars = wcslen(text);
    char *encoded, *dst;
    size_t pos, room, n;
    wchar_t one[2];   /* one character plus terminator, fed to wcstombs() */

    one[1] = 0;
    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    /* First pass: compute the number of output bytes. */
    room = 0;
    for (pos = 0; pos < nchars; pos++) {
        wchar_t ch = text[pos];
        if (0xdc80 <= ch && ch <= 0xdcff) {
            /* UTF-8b surrogate: always exactly one byte */
            room += 1;
            continue;
        }
        one[0] = ch;
        n = wcstombs(NULL, one, 0);
        if (n == (size_t)-1) {
            if (error_pos != NULL)
                *error_pos = pos;
            return NULL;
        }
        room += n;
    }
    room += 1; /* nul byte at the end */

    encoded = PyMem_Malloc(room);
    if (encoded == NULL)
        return NULL;

    /* Second pass: write the bytes; room tracks the space still free. */
    dst = encoded;
    for (pos = 0; pos < nchars; pos++) {
        wchar_t ch = text[pos];
        if (0xdc80 <= ch && ch <= 0xdcff) {
            *dst++ = ch - 0xdc00;
            room--;
            continue;
        }
        one[0] = ch;
        n = wcstombs(dst, one, room);
        if (n == (size_t)-1) {
            PyMem_Free(encoded);
            if (error_pos != NULL)
                *error_pos = pos;
            return NULL;
        }
        dst += n;
        room -= n;
    }
    *dst = 0;
    return encoded;
}
212
/* Strictly speaking this should be guarded by HAVE__WSTAT, with _wstat
   detected by autoconf. However, no current POSIX system provides that
   function, so testing for it is pointless.
   It is unclear whether the MS_WINDOWS guard is necessary:
   perhaps for cygwin/mingw builds?
*/
#if defined(HAVE_STAT) && !defined(MS_WINDOWS)

/* Get file status for a wide character path. The path is encoded with
   _Py_wchar2char() and handed to stat(), whose result is returned.
   If the path cannot be encoded, set errno to EINVAL and return -1. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    int status;
    char *encoded = _Py_wchar2char(path, NULL);

    if (encoded != NULL) {
        status = stat(encoded, buf);
        PyMem_Free(encoded);
        return status;
    }
    errno = EINVAL;
    return -1;
}
#endif
239
Victor Stinner6672d0c2010-10-07 22:53:43 +0000240/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
241 call stat() otherwise. Only fill st_mode attribute on Windows.
242
Victor Stinnerbd0850b2011-12-18 20:47:30 +0100243 Return 0 on success, -1 on _wstat() / stat() error, -2 if an exception was
244 raised. */
Victor Stinner4e314432010-10-07 21:45:39 +0000245
246int
Victor Stinnera4a75952010-10-07 22:23:10 +0000247_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner4e314432010-10-07 21:45:39 +0000248{
249#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000250 int err;
251 struct _stat wstatbuf;
Victor Stinneree587ea2011-11-17 00:51:38 +0100252 wchar_t *wpath;
Victor Stinner4e314432010-10-07 21:45:39 +0000253
Victor Stinneree587ea2011-11-17 00:51:38 +0100254 wpath = PyUnicode_AsUnicode(path);
255 if (wpath == NULL)
Victor Stinnerbd0850b2011-12-18 20:47:30 +0100256 return -2;
Victor Stinneree587ea2011-11-17 00:51:38 +0100257 err = _wstat(wpath, &wstatbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000258 if (!err)
259 statbuf->st_mode = wstatbuf.st_mode;
260 return err;
261#else
262 int ret;
Victor Stinnera4a75952010-10-07 22:23:10 +0000263 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000264 if (bytes == NULL)
Victor Stinnerbd0850b2011-12-18 20:47:30 +0100265 return -2;
Victor Stinner4e314432010-10-07 21:45:39 +0000266 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
267 Py_DECREF(bytes);
268 return ret;
269#endif
270}
271
/* Open a file. Use _wfopen() on Windows, encode the path to the locale
   encoding and use fopen() otherwise.

   Return the stream on success, or NULL on error. On non-Windows builds,
   errno is set to EINVAL when the mode cannot be converted or does not fit
   the internal buffer, and when the path cannot be encoded; otherwise the
   result is whatever fopen() returns. */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifndef MS_WINDOWS
    FILE *f;
    char *cpath;
    char cmode[10];
    size_t r;
    r = wcstombs(cmode, mode, sizeof(cmode));
    /* r == sizeof(cmode) means the buffer was filled without room for the
       terminating null byte, so the mode is rejected as too long. */
    if (r == (size_t)-1 || r >= sizeof(cmode)) {
        errno = EINVAL;
        return NULL;
    }
    cpath = _Py_wchar2char(path, NULL);
    if (cpath == NULL) {
        /* Report the failure through errno, like the mode check above. */
        errno = EINVAL;
        return NULL;
    }
    f = fopen(cpath, cmode);
    PyMem_Free(cpath);
    return f;
#else
    return _wfopen(path, mode);
#endif
}
298
Victor Stinner6672d0c2010-10-07 22:53:43 +0000299/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
300 call fopen() otherwise.
301
302 Return the new file object on success, or NULL if the file cannot be open or
303 (if PyErr_Occurred()) on unicode error */
Victor Stinner4e314432010-10-07 21:45:39 +0000304
305FILE*
Victor Stinnera4a75952010-10-07 22:23:10 +0000306_Py_fopen(PyObject *path, const char *mode)
Victor Stinner4e314432010-10-07 21:45:39 +0000307{
308#ifdef MS_WINDOWS
Victor Stinneree587ea2011-11-17 00:51:38 +0100309 wchar_t *wpath;
Victor Stinner4e314432010-10-07 21:45:39 +0000310 wchar_t wmode[10];
311 int usize;
Victor Stinner4e314432010-10-07 21:45:39 +0000312
Antoine Pitrou0e576f12011-12-22 10:03:38 +0100313 if (!PyUnicode_Check(path)) {
314 PyErr_Format(PyExc_TypeError,
315 "str file path expected under Windows, got %R",
316 Py_TYPE(path));
317 return NULL;
318 }
Victor Stinneree587ea2011-11-17 00:51:38 +0100319 wpath = PyUnicode_AsUnicode(path);
320 if (wpath == NULL)
321 return NULL;
322
Victor Stinner4e314432010-10-07 21:45:39 +0000323 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
324 if (usize == 0)
325 return NULL;
326
Victor Stinneree587ea2011-11-17 00:51:38 +0100327 return _wfopen(wpath, wmode);
Victor Stinner4e314432010-10-07 21:45:39 +0000328#else
329 FILE *f;
Antoine Pitrou2b1cc892011-12-19 18:19:06 +0100330 PyObject *bytes;
331 if (!PyUnicode_FSConverter(path, &bytes))
Victor Stinner4e314432010-10-07 21:45:39 +0000332 return NULL;
333 f = fopen(PyBytes_AS_STRING(bytes), mode);
334 Py_DECREF(bytes);
335 return f;
336#endif
337}
338
#ifdef HAVE_READLINK

/* Read the value of a symbolic link. The path is encoded to the locale
   encoding for readlink(), and the link target is decoded from the locale
   encoding into buf, which holds bufsiz wide characters.

   Return the length of the target in wide characters (excluding the null
   character), or -1 on error. errno is set to EINVAL when the path cannot be
   encoded, the target cannot be decoded, or the target does not fit. */

int
_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
{
    char target[PATH_MAX];
    char *encoded_path;
    wchar_t *decoded;
    size_t decoded_len;
    int nbytes;

    encoded_path = _Py_wchar2char(path, NULL);
    if (encoded_path == NULL)
        goto invalid;

    nbytes = (int)readlink(encoded_path, target, PATH_MAX);
    PyMem_Free(encoded_path);
    if (nbytes == -1)
        return -1;
    /* A completely filled buffer leaves no room for the null byte. */
    if (nbytes == PATH_MAX)
        goto invalid;
    target[nbytes] = '\0'; /* readlink() does not terminate the string */

    decoded = _Py_char2wchar(target, &decoded_len);
    if (decoded == NULL)
        goto invalid;
    if (bufsiz <= decoded_len) {
        PyMem_Free(decoded);
        goto invalid;
    }
    /* bufsiz > decoded_len, so the copy includes the null character */
    wcsncpy(buf, decoded, bufsiz);
    PyMem_Free(decoded);
    return (int)decoded_len;

invalid:
    errno = EINVAL;
    return -1;
}
#endif
382
#ifdef HAVE_REALPATH

/* Return the canonicalized absolute pathname. The path is encoded to the
   locale encoding for realpath(), and the result is decoded from the locale
   encoding into resolved_path, which holds resolved_path_size wide
   characters.

   Return resolved_path on success, NULL on error. errno is set to EINVAL
   when the path cannot be encoded, the result cannot be decoded, or the
   result does not fit in resolved_path. */

wchar_t*
_Py_wrealpath(const wchar_t *path,
              wchar_t *resolved_path, size_t resolved_path_size)
{
    char canonical[PATH_MAX];
    char *encoded_path;
    char *found;
    wchar_t *decoded;
    size_t decoded_len;

    encoded_path = _Py_wchar2char(path, NULL);
    if (encoded_path == NULL)
        goto invalid;

    found = realpath(encoded_path, canonical);
    PyMem_Free(encoded_path);
    if (found == NULL)
        return NULL;

    decoded = _Py_char2wchar(canonical, &decoded_len);
    if (decoded == NULL)
        goto invalid;
    if (resolved_path_size <= decoded_len) {
        PyMem_Free(decoded);
        goto invalid;
    }
    /* the buffer is larger than decoded_len: the null character is copied */
    wcsncpy(resolved_path, decoded, resolved_path_size);
    PyMem_Free(decoded);
    return resolved_path;

invalid:
    errno = EINVAL;
    return NULL;
}
#endif
423
Victor Stinnerf4061da2010-10-14 12:37:19 +0000424/* Get the current directory. size is the buffer size in wide characters
Victor Stinneraf02e1c2011-12-16 23:56:01 +0100425 including the null character. Decode the path from the locale encoding.
426 Return NULL on error. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000427
Victor Stinner4e314432010-10-07 21:45:39 +0000428wchar_t*
429_Py_wgetcwd(wchar_t *buf, size_t size)
430{
431#ifdef MS_WINDOWS
432 return _wgetcwd(buf, size);
433#else
434 char fname[PATH_MAX];
Victor Stinnerf4061da2010-10-14 12:37:19 +0000435 wchar_t *wname;
Victor Stinner168e1172010-10-16 23:16:16 +0000436 size_t len;
Victor Stinnerf4061da2010-10-14 12:37:19 +0000437
Victor Stinner4e314432010-10-07 21:45:39 +0000438 if (getcwd(fname, PATH_MAX) == NULL)
439 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000440 wname = _Py_char2wchar(fname, &len);
Victor Stinnerf4061da2010-10-14 12:37:19 +0000441 if (wname == NULL)
442 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000443 if (size <= len) {
Victor Stinnerf4061da2010-10-14 12:37:19 +0000444 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000445 return NULL;
446 }
Victor Stinnerf4061da2010-10-14 12:37:19 +0000447 wcsncpy(buf, wname, size);
448 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000449 return buf;
450#endif
451}
452
453#endif