blob: 18e98e513c6298e034f28544de66f3853ce80270 [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
Victor Stinnerb306d752010-10-07 22:09:40 +00002#ifdef MS_WINDOWS
3# include <windows.h>
4#endif
Victor Stinner4e314432010-10-07 21:45:39 +00005
6#ifdef HAVE_STAT
7
/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory) and write the number of written wide
   characters excluding the null character into *size if size is not NULL, or
   NULL on error (conversion error or memory error). On error, *size is left
   untouched and no memory is left allocated. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    /* Largest number of wide characters (terminator included) whose size
       in bytes can still be represented in a size_t. */
    const size_t max_chars = (size_t)-1 / sizeof(wchar_t);
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif
    if (argsize != (size_t)-1) {
        /* Refuse sizes for which (argsize+1)*sizeof(wchar_t) would wrap. */
        if (argsize >= max_chars)
            goto oom;
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */

    /* One output character per input byte, plus the terminator, is always
       enough: every escaped byte yields exactly one wide character and a
       decoded multi-byte sequence yields fewer characters than bytes. */
    argsize = strlen(arg) + 1;
    if (argsize > max_chars)
        goto oom;
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            /* Do not leak the output buffer on this error path. */
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    res = PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while(*in)
        if(*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    *out = 0;
#endif
    if (size != NULL)
        *size = out - res;
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}
127
/* Encode a (wide) character string to the locale encoding with the
   surrogateescape error handler (characters in range U+DC80..U+DCFF are
   converted to bytes 0x80..0xFF).

   This function is the reverse of _Py_char2wchar().

   Return a pointer to a newly allocated byte string (use PyMem_Free() to free
   the memory), or NULL on conversion or memory allocation error.

   If error_pos is not NULL: *error_pos is the index of the invalid character
   on conversion error, or (size_t)-1 otherwise. */
char*
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
{
    const size_t nchars = wcslen(text);
    char *buffer = NULL;    /* start of the output; NULL while sizing */
    char *cursor = NULL;    /* write position; NULL while sizing */
    size_t remaining = 0;   /* bytes counted so far, then bytes still free */
    size_t pos, nbytes;
    wchar_t single[2];      /* one character plus its terminator */

    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    /* The same loop body runs twice: the first pass (cursor == NULL) only
       adds up the number of bytes needed, the second pass writes them. */
    single[1] = 0;
    for (;;) {
        for (pos = 0; pos < nchars; pos++) {
            const wchar_t ch = text[pos];

            if (ch >= 0xdc80 && ch <= 0xdcff) {
                /* UTF-8b surrogate: stands for exactly one raw byte */
                if (cursor == NULL) {
                    remaining++;
                }
                else {
                    *cursor++ = ch - 0xdc00;
                    remaining--;
                }
                continue;
            }

            single[0] = ch;
            if (cursor == NULL)
                nbytes = wcstombs(NULL, single, 0);
            else
                nbytes = wcstombs(cursor, single, remaining);
            if (nbytes == (size_t)-1) {
                /* Character not encodable in the locale encoding. */
                if (buffer != NULL)
                    PyMem_Free(buffer);
                if (error_pos != NULL)
                    *error_pos = pos;
                return NULL;
            }

            if (cursor == NULL) {
                remaining += nbytes;
            }
            else {
                cursor += nbytes;
                remaining -= nbytes;
            }
        }

        if (buffer != NULL) {
            /* Second pass finished: terminate the string and stop. */
            *cursor = 0;
            break;
        }

        remaining += 1; /* nul byte at the end */
        buffer = PyMem_Malloc(remaining);
        if (buffer == NULL)
            return NULL;
        cursor = buffer;
    }
    return buffer;
}
202
Victor Stinner4e314432010-10-07 21:45:39 +0000203/* In principle, this should use HAVE__WSTAT, and _wstat
204 should be detected by autoconf. However, no current
205 POSIX system provides that function, so testing for
206 it is pointless.
207 Not sure whether the MS_WINDOWS guards are necessary:
208 perhaps for cygwin/mingw builds?
209*/
Victor Stinnerb306d752010-10-07 22:09:40 +0000210#if defined(HAVE_STAT) && !defined(MS_WINDOWS)
Victor Stinner6672d0c2010-10-07 22:53:43 +0000211
/* Get file status: encode the wide character path with _Py_wchar2char() and
   pass the resulting byte string to stat().

   Return the result of stat(), or -1 with errno set to EINVAL if the path
   cannot be encoded. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    int status = -1;
    char *encoded = _Py_wchar2char(path, NULL);

    if (encoded != NULL) {
        status = stat(encoded, buf);
        PyMem_Free(encoded);
    }
    else {
        errno = EINVAL;
    }
    return status;
}
228#endif
229
Victor Stinner6672d0c2010-10-07 22:53:43 +0000230/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
231 call stat() otherwise. Only fill st_mode attribute on Windows.
232
233 Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
234 unicode error. */
Victor Stinner4e314432010-10-07 21:45:39 +0000235
236int
Victor Stinnera4a75952010-10-07 22:23:10 +0000237_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner4e314432010-10-07 21:45:39 +0000238{
239#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000240 int err;
241 struct _stat wstatbuf;
242
Victor Stinnera4a75952010-10-07 22:23:10 +0000243 err = _wstat(PyUnicode_AS_UNICODE(path), &wstatbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000244 if (!err)
245 statbuf->st_mode = wstatbuf.st_mode;
246 return err;
247#else
248 int ret;
Victor Stinnera4a75952010-10-07 22:23:10 +0000249 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000250 if (bytes == NULL)
251 return -1;
252 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
253 Py_DECREF(bytes);
254 return ret;
255#endif
256}
257
/* Open a file. Use _wfopen() on Windows, encode the path to the locale
   encoding and use fopen() otherwise.

   Return the stream returned by _wfopen() / fopen(). Outside Windows, NULL is
   also returned with errno set to EINVAL when the mode does not fit in the
   internal 10 byte buffer or when the mode or the path cannot be encoded. */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifndef MS_WINDOWS
    FILE *f;
    char *cpath;
    char cmode[10];
    size_t r;
    r = wcstombs(cmode, mode, 10);
    /* r == 10 means the buffer was filled without room for the nul byte. */
    if (r == (size_t)-1 || r >= 10) {
        errno = EINVAL;
        return NULL;
    }
    cpath = _Py_wchar2char(path, NULL);
    if (cpath == NULL) {
        /* Report the failure through errno, like the mode check above and
           like _Py_wstat(), instead of leaving a stale errno behind. */
        errno = EINVAL;
        return NULL;
    }
    f = fopen(cpath, cmode);
    PyMem_Free(cpath);
    return f;
#else
    return _wfopen(path, mode);
#endif
}
284
Victor Stinner6672d0c2010-10-07 22:53:43 +0000285/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
286 call fopen() otherwise.
287
288 Return the new file object on success, or NULL if the file cannot be open or
289 (if PyErr_Occurred()) on unicode error */
Victor Stinner4e314432010-10-07 21:45:39 +0000290
291FILE*
Victor Stinnera4a75952010-10-07 22:23:10 +0000292_Py_fopen(PyObject *path, const char *mode)
Victor Stinner4e314432010-10-07 21:45:39 +0000293{
294#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000295 wchar_t wmode[10];
296 int usize;
Victor Stinner4e314432010-10-07 21:45:39 +0000297
298 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
299 if (usize == 0)
300 return NULL;
301
Victor Stinnera4a75952010-10-07 22:23:10 +0000302 return _wfopen(PyUnicode_AS_UNICODE(path), wmode);
Victor Stinner4e314432010-10-07 21:45:39 +0000303#else
304 FILE *f;
Victor Stinnera4a75952010-10-07 22:23:10 +0000305 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000306 if (bytes == NULL)
307 return NULL;
308 f = fopen(PyBytes_AS_STRING(bytes), mode);
309 Py_DECREF(bytes);
310 return f;
311#endif
312}
313
314#ifdef HAVE_READLINK
Victor Stinner6672d0c2010-10-07 22:53:43 +0000315
316/* Read value of symbolic link. Encode the path to the locale encoding, decode
317 the result from the locale encoding. */
318
Victor Stinner4e314432010-10-07 21:45:39 +0000319int
320_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
321{
322 char *cpath;
323 char cbuf[PATH_MAX];
Victor Stinner3f711f42010-10-16 22:47:37 +0000324 wchar_t *wbuf;
Victor Stinner4e314432010-10-07 21:45:39 +0000325 int res;
326 size_t r1;
327
Victor Stinner2f02a512010-11-08 22:43:46 +0000328 cpath = _Py_wchar2char(path, NULL);
Victor Stinner4e314432010-10-07 21:45:39 +0000329 if (cpath == NULL) {
330 errno = EINVAL;
331 return -1;
332 }
333 res = (int)readlink(cpath, cbuf, PATH_MAX);
334 PyMem_Free(cpath);
335 if (res == -1)
336 return -1;
337 if (res == PATH_MAX) {
338 errno = EINVAL;
339 return -1;
340 }
341 cbuf[res] = '\0'; /* buf will be null terminated */
Victor Stinner168e1172010-10-16 23:16:16 +0000342 wbuf = _Py_char2wchar(cbuf, &r1);
Victor Stinner350147b2010-10-16 22:52:09 +0000343 if (wbuf == NULL) {
344 errno = EINVAL;
345 return -1;
346 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000347 if (bufsiz <= r1) {
348 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000349 errno = EINVAL;
350 return -1;
351 }
Victor Stinner3f711f42010-10-16 22:47:37 +0000352 wcsncpy(buf, wbuf, bufsiz);
353 PyMem_Free(wbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000354 return (int)r1;
355}
356#endif
357
358#ifdef HAVE_REALPATH
Victor Stinner6672d0c2010-10-07 22:53:43 +0000359
360/* Return the canonicalized absolute pathname. Encode path to the locale
361 encoding, decode the result from the locale encoding. */
362
Victor Stinner4e314432010-10-07 21:45:39 +0000363wchar_t*
Victor Stinner015f4d82010-10-07 22:29:53 +0000364_Py_wrealpath(const wchar_t *path,
365 wchar_t *resolved_path, size_t resolved_path_size)
Victor Stinner4e314432010-10-07 21:45:39 +0000366{
367 char *cpath;
368 char cresolved_path[PATH_MAX];
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000369 wchar_t *wresolved_path;
Victor Stinner4e314432010-10-07 21:45:39 +0000370 char *res;
371 size_t r;
Victor Stinner2f02a512010-11-08 22:43:46 +0000372 cpath = _Py_wchar2char(path, NULL);
Victor Stinner4e314432010-10-07 21:45:39 +0000373 if (cpath == NULL) {
374 errno = EINVAL;
375 return NULL;
376 }
377 res = realpath(cpath, cresolved_path);
378 PyMem_Free(cpath);
379 if (res == NULL)
380 return NULL;
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000381
Victor Stinner168e1172010-10-16 23:16:16 +0000382 wresolved_path = _Py_char2wchar(cresolved_path, &r);
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000383 if (wresolved_path == NULL) {
Victor Stinner4e314432010-10-07 21:45:39 +0000384 errno = EINVAL;
385 return NULL;
386 }
Victor Stinner0a1b8cb2010-10-16 22:55:47 +0000387 if (resolved_path_size <= r) {
388 PyMem_Free(wresolved_path);
389 errno = EINVAL;
390 return NULL;
391 }
392 wcsncpy(resolved_path, wresolved_path, resolved_path_size);
393 PyMem_Free(wresolved_path);
Victor Stinner4e314432010-10-07 21:45:39 +0000394 return resolved_path;
395}
396#endif
397
Victor Stinnerf4061da2010-10-14 12:37:19 +0000398/* Get the current directory. size is the buffer size in wide characters
399 including the null character. Decode the path from the locale encoding. */
Victor Stinner6672d0c2010-10-07 22:53:43 +0000400
Victor Stinner4e314432010-10-07 21:45:39 +0000401wchar_t*
402_Py_wgetcwd(wchar_t *buf, size_t size)
403{
404#ifdef MS_WINDOWS
405 return _wgetcwd(buf, size);
406#else
407 char fname[PATH_MAX];
Victor Stinnerf4061da2010-10-14 12:37:19 +0000408 wchar_t *wname;
Victor Stinner168e1172010-10-16 23:16:16 +0000409 size_t len;
Victor Stinnerf4061da2010-10-14 12:37:19 +0000410
Victor Stinner4e314432010-10-07 21:45:39 +0000411 if (getcwd(fname, PATH_MAX) == NULL)
412 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000413 wname = _Py_char2wchar(fname, &len);
Victor Stinnerf4061da2010-10-14 12:37:19 +0000414 if (wname == NULL)
415 return NULL;
Victor Stinner168e1172010-10-16 23:16:16 +0000416 if (size <= len) {
Victor Stinnerf4061da2010-10-14 12:37:19 +0000417 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000418 return NULL;
419 }
Victor Stinnerf4061da2010-10-14 12:37:19 +0000420 wcsncpy(buf, wname, size);
421 PyMem_Free(wname);
Victor Stinner4e314432010-10-07 21:45:39 +0000422 return buf;
423#endif
424}
425
426#endif