blob: c563eaa5fbb7438991509e4d78fd28b7e6486f85 [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
Victor Stinnerb306d752010-10-07 22:09:40 +00002#ifdef MS_WINDOWS
3# include <windows.h>
4#endif
Victor Stinner4e314432010-10-07 21:45:39 +00005
6#ifdef HAVE_STAT
7
/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory) and write the number of written wide
   characters excluding the null character into *size if size is not NULL, or
   NULL on error (conversion or memory allocation error). On allocation
   failure, including a requested size that would overflow size_t,
   "out of memory" is written to stderr before NULL is returned.

   Conversion errors should never happen, unless there is a bug in the C
   library. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
    /* Largest number of wchar_t elements whose total size in bytes still
       fits in a size_t; used to reject allocation sizes that would wrap. */
    const size_t max_chars = (size_t)-1 / sizeof(wchar_t);
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif

    if (argsize != (size_t)-1) {
        /* argsize+1 elements are needed (room for the null character);
           refuse sizes whose byte count cannot be represented. */
        if (argsize >= max_chars)
            goto oom;
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > max_chars)
        goto oom;
    res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    if (strlen(arg) >= max_chars)
        goto oom;
    res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif
    /* out points at the terminating null character, so the difference is
       the number of characters written, excluding that terminator. */
    if (size != NULL)
        *size = out - res;
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}
132
/* Encode a wide character string to the locale encoding, applying the
   surrogateescape error handler: each character in U+DC80..U+DCFF is written
   back as the single byte 0x80..0xFF it stands for.

   This is the inverse of _Py_char2wchar().

   The result is a newly allocated, null-terminated byte string that the
   caller releases with PyMem_Free(). NULL is returned when a character
   cannot be encoded or when the allocation fails.

   If error_pos is not NULL, *error_pos receives the index of the character
   that could not be encoded, or (size_t)-1 in every other case (success or
   memory allocation failure). */
char*
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
{
    const size_t nchars = wcslen(text);
    char *encoded = NULL;    /* output buffer; allocated after the first pass */
    char *cursor = NULL;     /* write position; NULL while only measuring */
    size_t remaining = 0;    /* pass 1: bytes needed, pass 2: bytes left */
    size_t pos, nbytes;
    wchar_t ch, single[2];   /* single[] holds one character + terminator */

    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    /* Two passes over the text share the same loop body: the first one
       (cursor == NULL) only adds up the output length, the second one
       writes the bytes into the buffer allocated in between. */
    single[1] = 0;
    for (;;) {
        for (pos = 0; pos < nchars; pos++) {
            ch = text[pos];
            if (ch >= 0xdc80 && ch <= 0xdcff) {
                /* Escaped byte: maps back to exactly one output byte. */
                if (cursor == NULL) {
                    remaining++;
                }
                else {
                    *cursor++ = ch - 0xdc00;
                    remaining--;
                }
                continue;
            }

            /* Regular character: let the C library encode it alone. */
            single[0] = ch;
            if (cursor == NULL)
                nbytes = wcstombs(NULL, single, 0);
            else
                nbytes = wcstombs(cursor, single, remaining);
            if (nbytes == (size_t)-1) {
                if (encoded != NULL)
                    PyMem_Free(encoded);
                if (error_pos != NULL)
                    *error_pos = pos;
                return NULL;
            }
            if (cursor == NULL) {
                remaining += nbytes;
            }
            else {
                cursor += nbytes;
                remaining -= nbytes;
            }
        }

        if (encoded != NULL) {
            /* Second pass finished: terminate the string and hand it out. */
            *cursor = 0;
            return encoded;
        }

        /* First pass finished: allocate the measured size plus the
           terminating null byte, then loop again to fill the buffer. */
        remaining += 1;
        encoded = PyMem_Malloc(remaining);
        if (encoded == NULL)
            return NULL;
        cursor = encoded;
    }
}
207
/* In principle, this should use HAVE__WSTAT, and _wstat
   should be detected by autoconf. However, no current
   POSIX system provides that function, so testing for
   it is pointless.
   Not sure whether the MS_WINDOWS guards are necessary:
   perhaps for cygwin/mingw builds?
*/
#if defined(HAVE_STAT) && !defined(MS_WINDOWS)

/* stat() for a wide character path: the path is encoded to the locale
   encoding with _Py_wchar2char() before stat() is called.

   Return the result of stat(), or -1 with errno set to EINVAL if the path
   cannot be encoded. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    int status = -1;
    char *encoded = _Py_wchar2char(path, NULL);

    if (encoded != NULL) {
        status = stat(encoded, buf);
        PyMem_Free(encoded);
    }
    else {
        errno = EINVAL;
    }
    return status;
}
#endif
234
Victor Stinner6672d0c2010-10-07 22:53:43 +0000235/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
236 call stat() otherwise. Only fill st_mode attribute on Windows.
237
238 Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
239 unicode error. */
Victor Stinner4e314432010-10-07 21:45:39 +0000240
241int
Victor Stinnera4a75952010-10-07 22:23:10 +0000242_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner4e314432010-10-07 21:45:39 +0000243{
244#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000245 int err;
246 struct _stat wstatbuf;
247
Victor Stinnera4a75952010-10-07 22:23:10 +0000248 err = _wstat(PyUnicode_AS_UNICODE(path), &wstatbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000249 if (!err)
250 statbuf->st_mode = wstatbuf.st_mode;
251 return err;
252#else
253 int ret;
Victor Stinnera4a75952010-10-07 22:23:10 +0000254 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000255 if (bytes == NULL)
256 return -1;
257 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
258 Py_DECREF(bytes);
259 return ret;
260#endif
261}
262
/* Open a file given a wide character path and mode.

   On Windows this is a direct call to _wfopen(). Elsewhere the mode is
   converted with wcstombs() and the path with _Py_wchar2char() (locale
   encoding), then fopen() is called.

   Return NULL if the mode cannot be converted or does not fit in the local
   buffer (errno is set to EINVAL), if the path cannot be encoded, or if
   fopen() fails. */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifdef MS_WINDOWS
    return _wfopen(path, mode);
#else
    char mode_bytes[10];
    char *path_bytes;
    FILE *stream;
    size_t mode_len;

    /* A result equal to the buffer size means no room was left for the
       terminating null byte, so it is rejected as well. */
    mode_len = wcstombs(mode_bytes, mode, sizeof(mode_bytes));
    if (mode_len == (size_t)-1 || mode_len >= sizeof(mode_bytes)) {
        errno = EINVAL;
        return NULL;
    }

    path_bytes = _Py_wchar2char(path, NULL);
    if (path_bytes == NULL)
        return NULL;

    stream = fopen(path_bytes, mode_bytes);
    PyMem_Free(path_bytes);
    return stream;
#endif
}
289
Victor Stinner6672d0c2010-10-07 22:53:43 +0000290/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
291 call fopen() otherwise.
292
293 Return the new file object on success, or NULL if the file cannot be open or
294 (if PyErr_Occurred()) on unicode error */
Victor Stinner4e314432010-10-07 21:45:39 +0000295
296FILE*
Victor Stinnera4a75952010-10-07 22:23:10 +0000297_Py_fopen(PyObject *path, const char *mode)
Victor Stinner4e314432010-10-07 21:45:39 +0000298{
299#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000300 wchar_t wmode[10];
301 int usize;
Victor Stinner4e314432010-10-07 21:45:39 +0000302
303 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
304 if (usize == 0)
305 return NULL;
306
Victor Stinnera4a75952010-10-07 22:23:10 +0000307 return _wfopen(PyUnicode_AS_UNICODE(path), wmode);
Victor Stinner4e314432010-10-07 21:45:39 +0000308#else
309 FILE *f;
Victor Stinnera4a75952010-10-07 22:23:10 +0000310 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000311 if (bytes == NULL)
312 return NULL;
313 f = fopen(PyBytes_AS_STRING(bytes), mode);
314 Py_DECREF(bytes);
315 return f;
316#endif
317}
318
#ifdef HAVE_READLINK

/* Read the value of a symbolic link. The path is encoded to the locale
   encoding for readlink(), and the link target is decoded from the locale
   encoding into buf.

   bufsiz is the capacity of buf in wide characters; it must be larger than
   the decoded target so that the terminating null character fits.

   Return the number of wide characters in the target (excluding the null
   character), or -1 on error. errno is set to EINVAL when the path cannot
   be encoded, when the target fills the whole PATH_MAX byte buffer, when
   the target cannot be decoded, or when buf is too small. */

int
_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
{
    char target[PATH_MAX];
    char *encoded_path;
    wchar_t *decoded;
    size_t decoded_len;
    int nread;
    int fits;

    encoded_path = _Py_wchar2char(path, NULL);
    if (encoded_path == NULL) {
        errno = EINVAL;
        return -1;
    }

    nread = (int)readlink(encoded_path, target, PATH_MAX);
    PyMem_Free(encoded_path);
    if (nread == -1)
        return -1;
    if (nread == PATH_MAX) {
        /* No room left to null-terminate the target. */
        errno = EINVAL;
        return -1;
    }
    /* readlink() does not terminate the string; do it here. */
    target[nread] = '\0';

    decoded = _Py_char2wchar(target, &decoded_len);
    if (decoded == NULL) {
        errno = EINVAL;
        return -1;
    }

    /* buf needs room for decoded_len characters plus the terminator. */
    fits = (decoded_len < bufsiz);
    if (fits)
        wcsncpy(buf, decoded, bufsiz);
    PyMem_Free(decoded);
    if (!fits) {
        errno = EINVAL;
        return -1;
    }
    return (int)decoded_len;
}
#endif
362
#ifdef HAVE_REALPATH

/* Return the canonicalized absolute pathname. The path is encoded to the
   locale encoding for realpath(), and the result is decoded from the locale
   encoding into resolved_path.

   resolved_path_size is the capacity of resolved_path in wide characters;
   it must be larger than the decoded result so that the terminating null
   character fits.

   Return resolved_path on success, or NULL on error. errno is set to EINVAL
   when the path cannot be encoded, when the result cannot be decoded, or
   when resolved_path is too small. */

wchar_t*
_Py_wrealpath(const wchar_t *path,
              wchar_t *resolved_path, size_t resolved_path_size)
{
    char canonical[PATH_MAX];
    char *encoded_path;
    char *found;
    wchar_t *decoded;
    size_t decoded_len;
    int fits;

    encoded_path = _Py_wchar2char(path, NULL);
    if (encoded_path == NULL) {
        errno = EINVAL;
        return NULL;
    }

    found = realpath(encoded_path, canonical);
    PyMem_Free(encoded_path);
    if (found == NULL)
        return NULL;

    decoded = _Py_char2wchar(canonical, &decoded_len);
    if (decoded == NULL) {
        errno = EINVAL;
        return NULL;
    }

    /* The output needs room for decoded_len characters plus the
       terminator. */
    fits = (decoded_len < resolved_path_size);
    if (fits)
        wcsncpy(resolved_path, decoded, resolved_path_size);
    PyMem_Free(decoded);
    if (!fits) {
        errno = EINVAL;
        return NULL;
    }
    return resolved_path;
}
#endif
402
Victor Stinnerf4061da2010-10-14 12:37:19 +0000403/* Get the current directory. size is the buffer size in wide characters
404 including the null character. Decode the path from the locale encoding. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000405
Victor Stinner4e314432010-10-07 21:45:39 +0000406wchar_t*
407_Py_wgetcwd(wchar_t *buf, size_t size)
408{
409#ifdef MS_WINDOWS
410 return _wgetcwd(buf, size);
411#else
412 char fname[PATH_MAX];
Victor Stinnerf4061da2010-10-14 12:37:19 +0000413 wchar_t *wname;
Victor Stinner168e1172010-10-16 23:16:16 +0000414 size_t len;
Victor Stinnerf4061da2010-10-14 12:37:19 +0000415
Victor Stinner4e314432010-10-07 21:45:39 +0000416 if (getcwd(fname, PATH_MAX) == NULL)
417 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000418 wname = _Py_char2wchar(fname, &len);
Victor Stinnerf4061da2010-10-14 12:37:19 +0000419 if (wname == NULL)
420 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000421 if (size <= len) {
Victor Stinnerf4061da2010-10-14 12:37:19 +0000422 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000423 return NULL;
424 }
Victor Stinnerf4061da2010-10-14 12:37:19 +0000425 wcsncpy(buf, wname, size);
426 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000427 return buf;
428#endif
429}
430
431#endif