blob: 9423cb02f1b9b9cef609bb346fca48ab9b75b90a [file] [log] [blame]
Victor Stinner4e314432010-10-07 21:45:39 +00001#include "Python.h"
Victor Stinnerb306d752010-10-07 22:09:40 +00002#ifdef MS_WINDOWS
3# include <windows.h>
4#endif
Victor Stinner4e314432010-10-07 21:45:39 +00005
6#ifdef HAVE_STAT
7
/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated (wide) character string (use
   PyMem_Free() to free the memory), or NULL on error (conversion error or
   memory error). A diagnostic is written to stderr on the error paths. */
wchar_t*
_Py_char2wchar(char* arg)
{
    wchar_t *res;
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    size_t argsize = strlen(arg);
#else
    size_t argsize = mbstowcs(NULL, arg, 0);
#endif
    size_t count;
    unsigned char *in;
    wchar_t *out;
#ifdef HAVE_MBRTOWC
    mbstate_t mbs;
#endif
    if (argsize != (size_t)-1) {
        res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize+1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
                ;
            if (*tmp == 0)
                return res;
        }
        /* Plain conversion is unusable: discard it and fall through to the
           escaping decoder below, which allocates its own buffer. */
        PyMem_Free(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    res = (wchar_t*)PyMem_Malloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            fprintf(stderr, "unexpected mbrtowc result -2\n");
            /* Release the partially filled buffer; the caller only gets
               NULL and has no way to free it. */
            PyMem_Free(res);
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (*out >= 0xd800 && *out <= 0xdfff) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
#else
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    while (*in) {
        if (*in < 128)
            *out++ = *in++;
        else
            *out++ = 0xdc00 + *in++;
    }
    *out = 0;
#endif
    return res;
oom:
    fprintf(stderr, "out of memory\n");
    return NULL;
}
121
/* Encode a (wide) character string to the locale encoding with the
   surrogateescape error handler (characters in range U+DC80..U+DCFF are
   converted to bytes 0x80..0xFF).

   This function is the reverse of _Py_char2wchar().

   Return a pointer to a newly allocated byte string (use PyMem_Free() to free
   the memory), or NULL on error (conversion error or memory error). */
char*
_Py_wchar2char(const wchar_t *text)
{
    const size_t nchars = wcslen(text);
    wchar_t one[2];         /* single character + terminator for wcstombs() */
    size_t needed = 0;      /* bytes required for the encoded string */
    size_t remaining;       /* bytes still free in the output buffer */
    size_t n, k;
    char *encoded, *dst;

    one[1] = 0;

    /* Pass 1: measure the encoded length without writing anything. */
    for (k = 0; k < nchars; k++) {
        wchar_t ch = text[k];

        if (ch >= 0xdc80 && ch <= 0xdcff) {
            /* UTF-8b surrogate: stands for exactly one raw byte */
            needed++;
            continue;
        }
        one[0] = ch;
        n = wcstombs(NULL, one, 0);
        if (n == (size_t)-1)
            return NULL;
        needed += n;
    }

    needed += 1; /* nul byte at the end */
    encoded = PyMem_Malloc(needed);
    if (encoded == NULL)
        return NULL;

    /* Pass 2: emit the bytes, character by character. */
    dst = encoded;
    remaining = needed;
    for (k = 0; k < nchars; k++) {
        wchar_t ch = text[k];

        if (ch >= 0xdc80 && ch <= 0xdcff) {
            /* UTF-8b surrogate: restore the original byte */
            *dst++ = ch - 0xdc00;
            remaining--;
            continue;
        }
        one[0] = ch;
        n = wcstombs(dst, one, remaining);
        if (n == (size_t)-1) {
            PyMem_Free(encoded);
            return NULL;
        }
        dst += n;
        remaining -= n;
    }
    *dst = 0;
    return encoded;
}
188
/* In principle, this should use HAVE__WSTAT, and _wstat
   should be detected by autoconf. However, no current
   POSIX system provides that function, so testing for
   it is pointless.
   Not sure whether the MS_WINDOWS guards are necessary:
   perhaps for cygwin/mingw builds?
*/
#if defined(HAVE_STAT) && !defined(MS_WINDOWS)

/* Get file status. The wide-character path is encoded with
   _Py_wchar2char() before calling stat().

   Return the result of stat(), or -1 with errno set to EINVAL if the path
   cannot be encoded. */

int
_Py_wstat(const wchar_t* path, struct stat *buf)
{
    int status = -1;
    char *encoded = _Py_wchar2char(path);

    if (encoded != NULL) {
        status = stat(encoded, buf);
        PyMem_Free(encoded);
    }
    else {
        errno = EINVAL;
    }
    return status;
}
#endif
215
Victor Stinner6672d0c2010-10-07 22:53:43 +0000216/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
217 call stat() otherwise. Only fill st_mode attribute on Windows.
218
219 Return 0 on success, -1 on _wstat() / stat() error or (if PyErr_Occurred())
220 unicode error. */
Victor Stinner4e314432010-10-07 21:45:39 +0000221
222int
Victor Stinnera4a75952010-10-07 22:23:10 +0000223_Py_stat(PyObject *path, struct stat *statbuf)
Victor Stinner4e314432010-10-07 21:45:39 +0000224{
225#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000226 int err;
227 struct _stat wstatbuf;
228
Victor Stinnera4a75952010-10-07 22:23:10 +0000229 err = _wstat(PyUnicode_AS_UNICODE(path), &wstatbuf);
Victor Stinner4e314432010-10-07 21:45:39 +0000230 if (!err)
231 statbuf->st_mode = wstatbuf.st_mode;
232 return err;
233#else
234 int ret;
Victor Stinnera4a75952010-10-07 22:23:10 +0000235 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000236 if (bytes == NULL)
237 return -1;
238 ret = stat(PyBytes_AS_STRING(bytes), statbuf);
239 Py_DECREF(bytes);
240 return ret;
241#endif
242}
243
/* Open a file. Use _wfopen() on Windows, encode the path to the locale
   encoding and use fopen() otherwise.

   Return the opened stream, or NULL on failure. On non-Windows platforms
   errno is set to EINVAL when the mode or the path cannot be encoded;
   otherwise errno is whatever fopen() left. */

FILE *
_Py_wfopen(const wchar_t *path, const wchar_t *mode)
{
#ifndef MS_WINDOWS
    FILE *f;
    char *cpath;
    char cmode[10];
    size_t r;
    r = wcstombs(cmode, mode, 10);
    /* r >= 10 means the mode filled the buffer without a terminating nul. */
    if (r == (size_t)-1 || r >= 10) {
        errno = EINVAL;
        return NULL;
    }
    cpath = _Py_wchar2char(path);
    if (cpath == NULL) {
        /* Report the failure through errno like the other wide-character
           wrappers do, instead of leaving a stale value for the caller. */
        errno = EINVAL;
        return NULL;
    }
    f = fopen(cpath, cmode);
    PyMem_Free(cpath);
    return f;
#else
    return _wfopen(path, mode);
#endif
}
270
Victor Stinner6672d0c2010-10-07 22:53:43 +0000271/* Call _wfopen() on Windows, or encode the path to the filesystem encoding and
272 call fopen() otherwise.
273
274 Return the new file object on success, or NULL if the file cannot be open or
275 (if PyErr_Occurred()) on unicode error */
Victor Stinner4e314432010-10-07 21:45:39 +0000276
277FILE*
Victor Stinnera4a75952010-10-07 22:23:10 +0000278_Py_fopen(PyObject *path, const char *mode)
Victor Stinner4e314432010-10-07 21:45:39 +0000279{
280#ifdef MS_WINDOWS
Victor Stinner4e314432010-10-07 21:45:39 +0000281 wchar_t wmode[10];
282 int usize;
Victor Stinner4e314432010-10-07 21:45:39 +0000283
284 usize = MultiByteToWideChar(CP_ACP, 0, mode, -1, wmode, sizeof(wmode));
285 if (usize == 0)
286 return NULL;
287
Victor Stinnera4a75952010-10-07 22:23:10 +0000288 return _wfopen(PyUnicode_AS_UNICODE(path), wmode);
Victor Stinner4e314432010-10-07 21:45:39 +0000289#else
290 FILE *f;
Victor Stinnera4a75952010-10-07 22:23:10 +0000291 PyObject *bytes = PyUnicode_EncodeFSDefault(path);
Victor Stinner4e314432010-10-07 21:45:39 +0000292 if (bytes == NULL)
293 return NULL;
294 f = fopen(PyBytes_AS_STRING(bytes), mode);
295 Py_DECREF(bytes);
296 return f;
297#endif
298}
299
#ifdef HAVE_READLINK

/* Read value of symbolic link. Encode the path to the locale encoding, decode
   the result from the locale encoding.

   Return the number of wide characters stored in buf, or -1 on error.
   errno is set to EINVAL if the path cannot be encoded, if the link target
   fills the whole PATH_MAX-byte temporary buffer, or if the target cannot be
   decoded.

   Note: mbstowcs() stores at most bufsiz wide characters; when the decoded
   target needs bufsiz characters or more, buf is not nul-terminated. */

int
_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
{
    char target[PATH_MAX];
    char *encoded_path;
    int nbytes;
    size_t nwide;

    encoded_path = _Py_wchar2char(path);
    if (encoded_path == NULL)
        goto invalid;

    nbytes = (int)readlink(encoded_path, target, PATH_MAX);
    PyMem_Free(encoded_path);
    if (nbytes == -1)
        return -1;
    if (nbytes == PATH_MAX)
        /* no room left for the terminating nul byte */
        goto invalid;
    target[nbytes] = '\0'; /* readlink() does not terminate its output */

    nwide = mbstowcs(buf, target, bufsiz);
    if (nwide == (size_t)-1)
        goto invalid;
    return (int)nwide;

invalid:
    errno = EINVAL;
    return -1;
}
#endif
335
#ifdef HAVE_REALPATH

/* Return the canonicalized absolute pathname. Encode path to the locale
   encoding, decode the result from the locale encoding.

   resolved_path must have room for resolved_path_size wide characters,
   including the terminating nul.

   Return resolved_path on success. Return NULL on error: errno is set to
   EINVAL if the path cannot be encoded, or if the result cannot be decoded or
   does not fit in resolved_path; otherwise errno is whatever realpath()
   left. */

wchar_t*
_Py_wrealpath(const wchar_t *path,
              wchar_t *resolved_path, size_t resolved_path_size)
{
    char *cpath;
    char cresolved_path[PATH_MAX];
    char *res;
    size_t r;
    cpath = _Py_wchar2char(path);
    if (cpath == NULL) {
        errno = EINVAL;
        return NULL;
    }
    res = realpath(cpath, cresolved_path);
    PyMem_Free(cpath);
    if (res == NULL)
        return NULL;
    r = mbstowcs(resolved_path, cresolved_path, resolved_path_size);
    /* mbstowcs() returns resolved_path_size when it ran out of room before
       writing the terminating nul, so the bound to check is the size of the
       caller's buffer, not PATH_MAX. */
    if (r == (size_t)-1 || r >= resolved_path_size) {
        errno = EINVAL;
        return NULL;
    }
    return resolved_path;
}
#endif
366
/* Get the current directory. Decode the path from the locale encoding.

   buf receives at most size wide characters. Return buf on success, or NULL
   on error. On non-Windows platforms errno is set to ERANGE when the decoded
   path (with its terminating nul) does not fit in buf or cannot be decoded;
   if getcwd() itself fails, its errno is left untouched. */

wchar_t*
_Py_wgetcwd(wchar_t *buf, size_t size)
{
#ifdef MS_WINDOWS
    return _wgetcwd(buf, size);
#else
    char cwd_bytes[PATH_MAX];
    size_t nwide;

    if (getcwd(cwd_bytes, PATH_MAX) == NULL)
        return NULL;

    /* A result of (size_t)-1 (decoding error) also fails this test. */
    nwide = mbstowcs(buf, cwd_bytes, size);
    if (nwide < size)
        return buf;

    errno = ERANGE;
    return NULL;
#endif
}
385
386#endif