blob: 3aaf98e841d9337e34290ef7334e2f75c1cf7e93 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000056 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000122 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000157 assert(length < INT_MAX);
158 unicode->length = (int)length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000160 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000161 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000165 }
166 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 return 0;
169}
170
171/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000172 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000173
174 XXX This allocator could further be enhanced by assuring that the
175 free list never reduces its size below 1.
176
177*/
178
179static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000180PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181{
182 register PyUnicodeObject *unicode;
183
Tim Petersced69f82003-09-16 20:30:58 +0000184 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 if (length == 0 && unicode_empty != NULL) {
186 Py_INCREF(unicode_empty);
187 return unicode_empty;
188 }
189
190 /* Unicode freelist & memory allocation */
191 if (unicode_freelist) {
192 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000193 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000195 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000196 /* Keep-Alive optimization: we only upsize the buffer,
197 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000198 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000199 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000200 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000204 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000206 }
207 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 }
209 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000210 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 if (unicode == NULL)
212 return NULL;
213 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
214 }
215
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000216 if (!unicode->str) {
217 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000218 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000219 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000220 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000221 * the caller fails before initializing str -- unicode_resize()
222 * reads str[0], and the Keep-Alive optimization can keep memory
223 * allocated for str alive across a call to unicode_dealloc(unicode).
224 * We don't want unicode_resize to read uninitialized memory in
225 * that case.
226 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000227 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str[length] = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000229 assert(length<INT_MAX);
230 unicode->length = (int)length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000232 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000234
235 onError:
236 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000237 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239}
240
241static
Guido van Rossum9475a232001-10-05 20:51:39 +0000242void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000244 if (PyUnicode_CheckExact(unicode) &&
245 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000246 /* Keep-Alive optimization */
247 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 unicode->str = NULL;
250 unicode->length = 0;
251 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000252 if (unicode->defenc) {
253 Py_DECREF(unicode->defenc);
254 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 }
256 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 *(PyUnicodeObject **)unicode = unicode_freelist;
258 unicode_freelist = unicode;
259 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000262 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000263 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000264 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 }
266}
267
Martin v. Löwis18e16552006-02-15 17:27:45 +0000268int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269{
270 register PyUnicodeObject *v;
271
272 /* Argument checks */
273 if (unicode == NULL) {
274 PyErr_BadInternalCall();
275 return -1;
276 }
277 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000278 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000279 PyErr_BadInternalCall();
280 return -1;
281 }
282
283 /* Resizing unicode_empty and single character objects is not
284 possible since these are being shared. We simply return a fresh
285 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000286 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000287 (v == unicode_empty || v->length == 1)) {
288 PyUnicodeObject *w = _PyUnicode_New(length);
289 if (w == NULL)
290 return -1;
291 Py_UNICODE_COPY(w->str, v->str,
292 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000293 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000294 *unicode = (PyObject *)w;
295 return 0;
296 }
297
298 /* Note that we don't have to modify *unicode for unshared Unicode
299 objects, since we can modify them in-place. */
300 return unicode_resize(v, length);
301}
302
/* Internal API for use in unicodeobject.c only !
   Casts the address of a Unicode object variable to PyObject ** and
   forwards to PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000308 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309{
310 PyUnicodeObject *unicode;
311
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 /* If the Unicode data is known at construction time, we can apply
313 some optimizations which share commonly used objects. */
314 if (u != NULL) {
315
316 /* Optimization for empty strings */
317 if (size == 0 && unicode_empty != NULL) {
318 Py_INCREF(unicode_empty);
319 return (PyObject *)unicode_empty;
320 }
321
322 /* Single character Unicode objects in the Latin-1 range are
323 shared when using this constructor */
324 if (size == 1 && *u < 256) {
325 unicode = unicode_latin1[*u];
326 if (!unicode) {
327 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 if (!unicode)
329 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000330 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000331 unicode_latin1[*u] = unicode;
332 }
333 Py_INCREF(unicode);
334 return (PyObject *)unicode;
335 }
336 }
Tim Petersced69f82003-09-16 20:30:58 +0000337
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 unicode = _PyUnicode_New(size);
339 if (!unicode)
340 return NULL;
341
342 /* Copy the Unicode data into the new object */
343 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000344 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345
346 return (PyObject *)unicode;
347}
348
349#ifdef HAVE_WCHAR_H
350
351PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000352 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353{
354 PyUnicodeObject *unicode;
355
356 if (w == NULL) {
357 PyErr_BadInternalCall();
358 return NULL;
359 }
360
361 unicode = _PyUnicode_New(size);
362 if (!unicode)
363 return NULL;
364
365 /* Copy the wchar_t data into the new object */
366#ifdef HAVE_USABLE_WCHAR_T
367 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000368#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 {
370 register Py_UNICODE *u;
371 register int i;
372 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000373 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 *u++ = *w++;
375 }
376#endif
377
378 return (PyObject *)unicode;
379}
380
Martin v. Löwis18e16552006-02-15 17:27:45 +0000381Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
382 wchar_t *w,
383 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384{
385 if (unicode == NULL) {
386 PyErr_BadInternalCall();
387 return -1;
388 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000389
390 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000392 size = PyUnicode_GET_SIZE(unicode) + 1;
393
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394#ifdef HAVE_USABLE_WCHAR_T
395 memcpy(w, unicode->str, size * sizeof(wchar_t));
396#else
397 {
398 register Py_UNICODE *u;
399 register int i;
400 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000401 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 *w++ = *u++;
403 }
404#endif
405
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000406 if (size > PyUnicode_GET_SIZE(unicode))
407 return PyUnicode_GET_SIZE(unicode);
408 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000409 return size;
410}
411
412#endif
413
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000414PyObject *PyUnicode_FromOrdinal(int ordinal)
415{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000416 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000417
418#ifdef Py_UNICODE_WIDE
419 if (ordinal < 0 || ordinal > 0x10ffff) {
420 PyErr_SetString(PyExc_ValueError,
421 "unichr() arg not in range(0x110000) "
422 "(wide Python build)");
423 return NULL;
424 }
425#else
426 if (ordinal < 0 || ordinal > 0xffff) {
427 PyErr_SetString(PyExc_ValueError,
428 "unichr() arg not in range(0x10000) "
429 "(narrow Python build)");
430 return NULL;
431 }
432#endif
433
Hye-Shik Chang40574832004-04-06 07:24:51 +0000434 s[0] = (Py_UNICODE)ordinal;
435 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000436}
437
Guido van Rossumd57fd912000-03-10 22:53:23 +0000438PyObject *PyUnicode_FromObject(register PyObject *obj)
439{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 /* XXX Perhaps we should make this API an alias of
441 PyObject_Unicode() instead ?! */
442 if (PyUnicode_CheckExact(obj)) {
443 Py_INCREF(obj);
444 return obj;
445 }
446 if (PyUnicode_Check(obj)) {
447 /* For a Unicode subtype that's not a Unicode object,
448 return a true Unicode object with the same data. */
449 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
450 PyUnicode_GET_SIZE(obj));
451 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000452 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
453}
454
455PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
456 const char *encoding,
457 const char *errors)
458{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000460 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000461 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000462
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 if (obj == NULL) {
464 PyErr_BadInternalCall();
465 return NULL;
466 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000468#if 0
469 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000470 that no encodings is given and then redirect to
471 PyObject_Unicode() which then applies the additional logic for
472 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000474 NOTE: This API should really only be used for object which
475 represent *encoded* Unicode !
476
477 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 if (PyUnicode_Check(obj)) {
479 if (encoding) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000485 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000486#else
487 if (PyUnicode_Check(obj)) {
488 PyErr_SetString(PyExc_TypeError,
489 "decoding Unicode is not supported");
490 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000491 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000492#endif
493
494 /* Coerce object */
495 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000496 s = PyString_AS_STRING(obj);
497 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
500 /* Overwrite the error message with something more useful in
501 case of a TypeError. */
502 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000503 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000504 "coercing to Unicode: need string or buffer, "
505 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000506 obj->ob_type->tp_name);
507 goto onError;
508 }
Tim Petersced69f82003-09-16 20:30:58 +0000509
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000511 if (len == 0) {
512 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 }
Tim Petersced69f82003-09-16 20:30:58 +0000515 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000517
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518 return v;
519
520 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522}
523
524PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000525 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000526 const char *encoding,
527 const char *errors)
528{
529 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000530
531 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000532 encoding = PyUnicode_GetDefaultEncoding();
533
534 /* Shortcuts for common default encodings */
535 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000536 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000537 else if (strcmp(encoding, "latin-1") == 0)
538 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000539#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
540 else if (strcmp(encoding, "mbcs") == 0)
541 return PyUnicode_DecodeMBCS(s, size, errors);
542#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000543 else if (strcmp(encoding, "ascii") == 0)
544 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000545
546 /* Decode via the codec registry */
547 buffer = PyBuffer_FromMemory((void *)s, size);
548 if (buffer == NULL)
549 goto onError;
550 unicode = PyCodec_Decode(buffer, encoding, errors);
551 if (unicode == NULL)
552 goto onError;
553 if (!PyUnicode_Check(unicode)) {
554 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000555 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000556 unicode->ob_type->tp_name);
557 Py_DECREF(unicode);
558 goto onError;
559 }
560 Py_DECREF(buffer);
561 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000562
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 onError:
564 Py_XDECREF(buffer);
565 return NULL;
566}
567
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000568PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
569 const char *encoding,
570 const char *errors)
571{
572 PyObject *v;
573
574 if (!PyUnicode_Check(unicode)) {
575 PyErr_BadArgument();
576 goto onError;
577 }
578
579 if (encoding == NULL)
580 encoding = PyUnicode_GetDefaultEncoding();
581
582 /* Decode via the codec registry */
583 v = PyCodec_Decode(unicode, encoding, errors);
584 if (v == NULL)
585 goto onError;
586 return v;
587
588 onError:
589 return NULL;
590}
591
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000593 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000594 const char *encoding,
595 const char *errors)
596{
597 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599 unicode = PyUnicode_FromUnicode(s, size);
600 if (unicode == NULL)
601 return NULL;
602 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
603 Py_DECREF(unicode);
604 return v;
605}
606
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000607PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
608 const char *encoding,
609 const char *errors)
610{
611 PyObject *v;
612
613 if (!PyUnicode_Check(unicode)) {
614 PyErr_BadArgument();
615 goto onError;
616 }
617
618 if (encoding == NULL)
619 encoding = PyUnicode_GetDefaultEncoding();
620
621 /* Encode via the codec registry */
622 v = PyCodec_Encode(unicode, encoding, errors);
623 if (v == NULL)
624 goto onError;
625 return v;
626
627 onError:
628 return NULL;
629}
630
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
632 const char *encoding,
633 const char *errors)
634{
635 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637 if (!PyUnicode_Check(unicode)) {
638 PyErr_BadArgument();
639 goto onError;
640 }
Fred Drakee4315f52000-05-09 19:53:39 +0000641
Tim Petersced69f82003-09-16 20:30:58 +0000642 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000643 encoding = PyUnicode_GetDefaultEncoding();
644
645 /* Shortcuts for common default encodings */
646 if (errors == NULL) {
647 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000648 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000649 else if (strcmp(encoding, "latin-1") == 0)
650 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000651#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
652 else if (strcmp(encoding, "mbcs") == 0)
653 return PyUnicode_AsMBCSString(unicode);
654#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000655 else if (strcmp(encoding, "ascii") == 0)
656 return PyUnicode_AsASCIIString(unicode);
657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658
659 /* Encode via the codec registry */
660 v = PyCodec_Encode(unicode, encoding, errors);
661 if (v == NULL)
662 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 if (!PyString_Check(v)) {
664 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000665 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 v->ob_type->tp_name);
667 Py_DECREF(v);
668 goto onError;
669 }
670 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000671
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 onError:
673 return NULL;
674}
675
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000676PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
677 const char *errors)
678{
679 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
680
681 if (v)
682 return v;
683 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
684 if (v && errors == NULL)
685 ((PyUnicodeObject *)unicode)->defenc = v;
686 return v;
687}
688
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
690{
691 if (!PyUnicode_Check(unicode)) {
692 PyErr_BadArgument();
693 goto onError;
694 }
695 return PyUnicode_AS_UNICODE(unicode);
696
697 onError:
698 return NULL;
699}
700
Martin v. Löwis18e16552006-02-15 17:27:45 +0000701Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000702{
703 if (!PyUnicode_Check(unicode)) {
704 PyErr_BadArgument();
705 goto onError;
706 }
707 return PyUnicode_GET_SIZE(unicode);
708
709 onError:
710 return -1;
711}
712
Thomas Wouters78890102000-07-22 19:25:51 +0000713const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000714{
715 return unicode_default_encoding;
716}
717
718int PyUnicode_SetDefaultEncoding(const char *encoding)
719{
720 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000721
Fred Drakee4315f52000-05-09 19:53:39 +0000722 /* Make sure the encoding is valid. As side effect, this also
723 loads the encoding into the codec registry cache. */
724 v = _PyCodec_Lookup(encoding);
725 if (v == NULL)
726 goto onError;
727 Py_DECREF(v);
728 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000729 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000730 sizeof(unicode_default_encoding));
731 return 0;
732
733 onError:
734 return -1;
735}
736
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000737/* error handling callback helper:
738 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000739 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000740 and adjust various state variables.
741 return 0 on success, -1 on error
742*/
743
744static
745int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
746 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000747 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
748 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000749{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000750 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000751
752 PyObject *restuple = NULL;
753 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000754 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
755 Py_ssize_t requiredsize;
756 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000757 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000758 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000759 int res = -1;
760
761 if (*errorHandler == NULL) {
762 *errorHandler = PyCodec_LookupError(errors);
763 if (*errorHandler == NULL)
764 goto onError;
765 }
766
767 if (*exceptionObject == NULL) {
768 *exceptionObject = PyUnicodeDecodeError_Create(
769 encoding, input, insize, *startinpos, *endinpos, reason);
770 if (*exceptionObject == NULL)
771 goto onError;
772 }
773 else {
774 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
775 goto onError;
776 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
777 goto onError;
778 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
779 goto onError;
780 }
781
782 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
783 if (restuple == NULL)
784 goto onError;
785 if (!PyTuple_Check(restuple)) {
786 PyErr_Format(PyExc_TypeError, &argparse[4]);
787 goto onError;
788 }
789 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
790 goto onError;
791 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000792 newpos = insize+newpos;
793 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000794 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000795 goto onError;
796 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000797
798 /* need more space? (at least enough for what we
799 have+the replacement+the rest of the string (starting
800 at the new input position), so we won't have to check space
801 when there are no errors in the rest of the string) */
802 repptr = PyUnicode_AS_UNICODE(repunicode);
803 repsize = PyUnicode_GET_SIZE(repunicode);
804 requiredsize = *outpos + repsize + insize-newpos;
805 if (requiredsize > outsize) {
806 if (requiredsize<2*outsize)
807 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000808 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000809 goto onError;
810 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
811 }
812 *endinpos = newpos;
813 *inptr = input + newpos;
814 Py_UNICODE_COPY(*outptr, repptr, repsize);
815 *outptr += repsize;
816 *outpos += repsize;
817 /* we made it! */
818 res = 0;
819
820 onError:
821 Py_XDECREF(restuple);
822 return res;
823}
824
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000825/* --- UTF-7 Codec -------------------------------------------------------- */
826
827/* see RFC2152 for details */
828
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Each entry says whether the character is special, i.e. cannot be
   directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F: space and punctuation up to the slash */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F: digits, colon, then punctuation up to the question mark */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F: at sign, letters A to O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F: letters P to Z, brackets, backslash, caret, underscore */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F: backquote, letters a to o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F: letters p to z, braces, bar, tilde, DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
847
/* Helper macros for the UTF-7 codec.  All of them may evaluate their
   arguments more than once, so only pass side-effect free expressions.

   SPECIAL(c, encodeO, encodeWS)
       True if character c has to be base64-encoded: it is outside the
       ASCII range, is marked 1 in utf7_special, or is marked as
       whitespace / Set O while the corresponding flag is set.
       Note: The comparison (c) <= 0 is a trick to work-around gcc
       warnings about the comparison always being false; since
       utf7_special[0] is 1, we can safely make that one comparison
       true.

   B64(n)      base64 digit for the low six bits of n.
   B64CHAR(c)  true if c is one of the 64 base64 digits.  This is spelled
               out as ASCII range tests rather than isalnum(): isalnum()
               depends on the current locale and is undefined for values
               that do not fit an unsigned char, while c may be any
               Py_UNICODE value here.
   UB64(c)     numeric value of the base64 digit c (only meaningful when
               B64CHAR(c) is true).

   ENCODE(out, ch, bits)
       While at least six bits are pending in ch, emit the topmost six as
       a base64 digit through out and decrease bits accordingly.

   DECODE(out, ch, bits, surrogate)
       While at least sixteen bits are pending in ch, extract the topmost
       sixteen as one Py_UNICODE and store it through out.  A value in the
       range 0xDC00-0xDFFF is treated as a surrogate pair, which cannot be
       represented in a 16-bit character: surrogate is set, errmsg is
       assigned and control jumps to the utf7Error label (both must exist
       at the point of use).  When surrogate is already set, an error has
       been generated for the high surrogate, so the next unit is dropped
       without checking whether the low surrogate is correct. */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000890
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000891PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000892 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000893 const char *errors)
894{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000895 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000896 Py_ssize_t startinpos;
897 Py_ssize_t endinpos;
898 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000899 const char *e;
900 PyUnicodeObject *unicode;
901 Py_UNICODE *p;
902 const char *errmsg = "";
903 int inShift = 0;
904 unsigned int bitsleft = 0;
905 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000906 int surrogate = 0;
907 PyObject *errorHandler = NULL;
908 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000909
910 unicode = _PyUnicode_New(size);
911 if (!unicode)
912 return NULL;
913 if (size == 0)
914 return (PyObject *)unicode;
915
916 p = unicode->str;
917 e = s + size;
918
919 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000920 Py_UNICODE ch;
921 restart:
922 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000923
924 if (inShift) {
925 if ((ch == '-') || !B64CHAR(ch)) {
926 inShift = 0;
927 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000928
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000929 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
930 if (bitsleft >= 6) {
931 /* The shift sequence has a partial character in it. If
932 bitsleft < 6 then we could just classify it as padding
933 but that is not the case here */
934
935 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000936 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000937 }
938 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000939 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000940 here so indicate the potential of a misencoded character. */
941
942 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
943 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
944 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000945 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946 }
947
948 if (ch == '-') {
949 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000950 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000951 inShift = 1;
952 }
953 } else if (SPECIAL(ch,0,0)) {
954 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000955 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000956 } else {
957 *p++ = ch;
958 }
959 } else {
960 charsleft = (charsleft << 6) | UB64(ch);
961 bitsleft += 6;
962 s++;
963 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
964 }
965 }
966 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000967 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000968 s++;
969 if (s < e && *s == '-') {
970 s++;
971 *p++ = '+';
972 } else
973 {
974 inShift = 1;
975 bitsleft = 0;
976 }
977 }
978 else if (SPECIAL(ch,0,0)) {
979 errmsg = "unexpected special character";
980 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000981 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000982 }
983 else {
984 *p++ = ch;
985 s++;
986 }
987 continue;
988 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000989 outpos = p-PyUnicode_AS_UNICODE(unicode);
990 endinpos = s-starts;
991 if (unicode_decode_call_errorhandler(
992 errors, &errorHandler,
993 "utf7", errmsg,
994 starts, size, &startinpos, &endinpos, &exc, &s,
995 (PyObject **)&unicode, &outpos, &p))
996 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001000 outpos = p-PyUnicode_AS_UNICODE(unicode);
1001 endinpos = size;
1002 if (unicode_decode_call_errorhandler(
1003 errors, &errorHandler,
1004 "utf7", "unterminated shift sequence",
1005 starts, size, &startinpos, &endinpos, &exc, &s,
1006 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001008 if (s < e)
1009 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001010 }
1011
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001012 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001013 goto onError;
1014
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001015 Py_XDECREF(errorHandler);
1016 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001017 return (PyObject *)unicode;
1018
1019onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001020 Py_XDECREF(errorHandler);
1021 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001022 Py_DECREF(unicode);
1023 return NULL;
1024}
1025
1026
1027PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001028 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001029 int encodeSetO,
1030 int encodeWhiteSpace,
1031 const char *errors)
1032{
1033 PyObject *v;
1034 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001035 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001036 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001037 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 unsigned int bitsleft = 0;
1039 unsigned long charsleft = 0;
1040 char * out;
1041 char * start;
1042
1043 if (size == 0)
1044 return PyString_FromStringAndSize(NULL, 0);
1045
1046 v = PyString_FromStringAndSize(NULL, cbAllocated);
1047 if (v == NULL)
1048 return NULL;
1049
1050 start = out = PyString_AS_STRING(v);
1051 for (;i < size; ++i) {
1052 Py_UNICODE ch = s[i];
1053
1054 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001055 if (ch == '+') {
1056 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001057 *out++ = '-';
1058 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1059 charsleft = ch;
1060 bitsleft = 16;
1061 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001062 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001064 } else {
1065 *out++ = (char) ch;
1066 }
1067 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1069 *out++ = B64(charsleft << (6-bitsleft));
1070 charsleft = 0;
1071 bitsleft = 0;
1072 /* Characters not in the BASE64 set implicitly unshift the sequence
1073 so no '-' is required, except if the character is itself a '-' */
1074 if (B64CHAR(ch) || ch == '-') {
1075 *out++ = '-';
1076 }
1077 inShift = 0;
1078 *out++ = (char) ch;
1079 } else {
1080 bitsleft += 16;
1081 charsleft = (charsleft << 16) | ch;
1082 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1083
1084 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001085 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001086 or '-' then the shift sequence will be terminated implicitly and we
1087 don't have to insert a '-'. */
1088
1089 if (bitsleft == 0) {
1090 if (i + 1 < size) {
1091 Py_UNICODE ch2 = s[i+1];
1092
1093 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001094
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001095 } else if (B64CHAR(ch2) || ch2 == '-') {
1096 *out++ = '-';
1097 inShift = 0;
1098 } else {
1099 inShift = 0;
1100 }
1101
1102 }
1103 else {
1104 *out++ = '-';
1105 inShift = 0;
1106 }
1107 }
Tim Petersced69f82003-09-16 20:30:58 +00001108 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001110 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001111 if (bitsleft) {
1112 *out++= B64(charsleft << (6-bitsleft) );
1113 *out++ = '-';
1114 }
1115
Tim Peters5de98422002-04-27 18:44:32 +00001116 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001117 return v;
1118}
1119
1120#undef SPECIAL
1121#undef B64
1122#undef B64CHAR
1123#undef UB64
1124#undef ENCODE
1125#undef DECODE
1126
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127/* --- UTF-8 Codec -------------------------------------------------------- */
1128
/* Map UTF-8 encoded prefix byte to sequence length. zero means
   illegal prefix. see RFC 2279 for details */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a valid first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1150
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001152 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 const char *errors)
1154{
Walter Dörwald69652032004-09-07 20:24:22 +00001155 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1156}
1157
1158PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001159 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001160 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001161 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001162{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001163 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001165 Py_ssize_t startinpos;
1166 Py_ssize_t endinpos;
1167 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 const char *e;
1169 PyUnicodeObject *unicode;
1170 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001172 PyObject *errorHandler = NULL;
1173 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
1175 /* Note: size will always be longer than the resulting Unicode
1176 character count */
1177 unicode = _PyUnicode_New(size);
1178 if (!unicode)
1179 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001180 if (size == 0) {
1181 if (consumed)
1182 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185
1186 /* Unpack UTF-8 encoded data */
1187 p = unicode->str;
1188 e = s + size;
1189
1190 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001191 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192
1193 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001194 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 s++;
1196 continue;
1197 }
1198
1199 n = utf8_code_length[ch];
1200
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001201 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001202 if (consumed)
1203 break;
1204 else {
1205 errmsg = "unexpected end of data";
1206 startinpos = s-starts;
1207 endinpos = size;
1208 goto utf8Error;
1209 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211
1212 switch (n) {
1213
1214 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001216 startinpos = s-starts;
1217 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001218 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219
1220 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001221 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001222 startinpos = s-starts;
1223 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 if ((s[1] & 0xc0) != 0x80) {
1228 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001229 startinpos = s-starts;
1230 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001231 goto utf8Error;
1232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001234 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001235 startinpos = s-starts;
1236 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001237 errmsg = "illegal encoding";
1238 goto utf8Error;
1239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001241 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 break;
1243
1244 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001245 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001246 (s[2] & 0xc0) != 0x80) {
1247 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248 startinpos = s-starts;
1249 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001250 goto utf8Error;
1251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001253 if (ch < 0x0800) {
1254 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001255 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001256
1257 XXX For wide builds (UCS-4) we should probably try
1258 to recombine the surrogates into a single code
1259 unit.
1260 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262 startinpos = s-starts;
1263 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001264 goto utf8Error;
1265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001267 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001268 break;
1269
1270 case 4:
1271 if ((s[1] & 0xc0) != 0x80 ||
1272 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001273 (s[3] & 0xc0) != 0x80) {
1274 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001275 startinpos = s-starts;
1276 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 goto utf8Error;
1278 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001279 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1280 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1281 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001282 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001283 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001284 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001285 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001286 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001288 startinpos = s-starts;
1289 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001290 goto utf8Error;
1291 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001292#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001293 *p++ = (Py_UNICODE)ch;
1294#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001295 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001296
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001297 /* translate from 10000..10FFFF to 0..FFFF */
1298 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001299
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001300 /* high surrogate = top 10 bits added to D800 */
1301 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001302
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001303 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001304 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001305#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306 break;
1307
1308 default:
1309 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001310 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001311 startinpos = s-starts;
1312 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001313 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 }
1315 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001317
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001318 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001319 outpos = p-PyUnicode_AS_UNICODE(unicode);
1320 if (unicode_decode_call_errorhandler(
1321 errors, &errorHandler,
1322 "utf8", errmsg,
1323 starts, size, &startinpos, &endinpos, &exc, &s,
1324 (PyObject **)&unicode, &outpos, &p))
1325 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326 }
Walter Dörwald69652032004-09-07 20:24:22 +00001327 if (consumed)
1328 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329
1330 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001331 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332 goto onError;
1333
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001334 Py_XDECREF(errorHandler);
1335 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001336 return (PyObject *)unicode;
1337
1338onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 Py_XDECREF(errorHandler);
1340 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 Py_DECREF(unicode);
1342 return NULL;
1343}
1344
Tim Peters602f7402002-04-27 18:03:26 +00001345/* Allocation strategy: if the string is short, convert into a stack buffer
1346 and allocate exactly as much space needed at the end. Else allocate the
1347 maximum possible needed (4 result bytes per Unicode character), and return
1348 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001349*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001350PyObject *
1351PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001352 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001353 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354{
Tim Peters602f7402002-04-27 18:03:26 +00001355#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001356
Martin v. Löwis18e16552006-02-15 17:27:45 +00001357 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001358 PyObject *v; /* result string object */
1359 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001360 Py_ssize_t nallocated; /* number of result bytes allocated */
Tim Peters602f7402002-04-27 18:03:26 +00001361 int nneeded; /* number of result bytes needed */
1362 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001363
Tim Peters602f7402002-04-27 18:03:26 +00001364 assert(s != NULL);
1365 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366
Tim Peters602f7402002-04-27 18:03:26 +00001367 if (size <= MAX_SHORT_UNICHARS) {
1368 /* Write into the stack buffer; nallocated can't overflow.
1369 * At the end, we'll allocate exactly as much heap space as it
1370 * turns out we need.
1371 */
1372 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1373 v = NULL; /* will allocate after we're done */
1374 p = stackbuf;
1375 }
1376 else {
1377 /* Overallocate on the heap, and give the excess back at the end. */
1378 nallocated = size * 4;
1379 if (nallocated / 4 != size) /* overflow! */
1380 return PyErr_NoMemory();
1381 v = PyString_FromStringAndSize(NULL, nallocated);
1382 if (v == NULL)
1383 return NULL;
1384 p = PyString_AS_STRING(v);
1385 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001386
Tim Peters602f7402002-04-27 18:03:26 +00001387 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001388 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001389
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001390 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001391 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001393
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001395 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001396 *p++ = (char)(0xc0 | (ch >> 6));
1397 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001398 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001399 else {
Tim Peters602f7402002-04-27 18:03:26 +00001400 /* Encode UCS2 Unicode ordinals */
1401 if (ch < 0x10000) {
1402 /* Special case: check for high surrogate */
1403 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1404 Py_UCS4 ch2 = s[i];
1405 /* Check for low surrogate and combine the two to
1406 form a UCS4 value */
1407 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001408 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001409 i++;
1410 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001411 }
Tim Peters602f7402002-04-27 18:03:26 +00001412 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001413 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001414 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001415 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1416 *p++ = (char)(0x80 | (ch & 0x3f));
1417 continue;
1418 }
1419encodeUCS4:
1420 /* Encode UCS4 Unicode ordinals */
1421 *p++ = (char)(0xf0 | (ch >> 18));
1422 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1423 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1424 *p++ = (char)(0x80 | (ch & 0x3f));
1425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001427
Tim Peters602f7402002-04-27 18:03:26 +00001428 if (v == NULL) {
1429 /* This was stack allocated. */
1430 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1431 assert(nneeded <= nallocated);
1432 v = PyString_FromStringAndSize(stackbuf, nneeded);
1433 }
1434 else {
1435 /* Cut back to size actually needed. */
1436 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1437 assert(nneeded <= nallocated);
1438 _PyString_Resize(&v, nneeded);
1439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441
Tim Peters602f7402002-04-27 18:03:26 +00001442#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443}
1444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1446{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 if (!PyUnicode_Check(unicode)) {
1448 PyErr_BadArgument();
1449 return NULL;
1450 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001451 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1452 PyUnicode_GET_SIZE(unicode),
1453 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454}
1455
1456/* --- UTF-16 Codec ------------------------------------------------------- */
1457
Tim Peters772747b2001-08-09 22:21:55 +00001458PyObject *
1459PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001460 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001461 const char *errors,
1462 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463{
Walter Dörwald69652032004-09-07 20:24:22 +00001464 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1465}
1466
1467PyObject *
1468PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001469 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001470 const char *errors,
1471 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001472 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001473{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001474 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001475 Py_ssize_t startinpos;
1476 Py_ssize_t endinpos;
1477 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478 PyUnicodeObject *unicode;
1479 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001480 const unsigned char *q, *e;
1481 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001482 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001483 /* Offsets from q for retrieving byte pairs in the right order. */
1484#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1485 int ihi = 1, ilo = 0;
1486#else
1487 int ihi = 0, ilo = 1;
1488#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001489 PyObject *errorHandler = NULL;
1490 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491
1492 /* Note: size will always be longer than the resulting Unicode
1493 character count */
1494 unicode = _PyUnicode_New(size);
1495 if (!unicode)
1496 return NULL;
1497 if (size == 0)
1498 return (PyObject *)unicode;
1499
1500 /* Unpack UTF-16 encoded data */
1501 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001502 q = (unsigned char *)s;
1503 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504
1505 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001506 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001508 /* Check for BOM marks (U+FEFF) in the input and adjust current
1509 byte order setting accordingly. In native mode, the leading BOM
1510 mark is skipped, in all other modes, it is copied to the output
1511 stream as-is (giving a ZWNBSP character). */
1512 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001513 if (size >= 2) {
1514 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001515#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001516 if (bom == 0xFEFF) {
1517 q += 2;
1518 bo = -1;
1519 }
1520 else if (bom == 0xFFFE) {
1521 q += 2;
1522 bo = 1;
1523 }
Tim Petersced69f82003-09-16 20:30:58 +00001524#else
Walter Dörwald69652032004-09-07 20:24:22 +00001525 if (bom == 0xFEFF) {
1526 q += 2;
1527 bo = 1;
1528 }
1529 else if (bom == 0xFFFE) {
1530 q += 2;
1531 bo = -1;
1532 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001533#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001534 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536
Tim Peters772747b2001-08-09 22:21:55 +00001537 if (bo == -1) {
1538 /* force LE */
1539 ihi = 1;
1540 ilo = 0;
1541 }
1542 else if (bo == 1) {
1543 /* force BE */
1544 ihi = 0;
1545 ilo = 1;
1546 }
1547
1548 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001550 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001552 if (consumed)
1553 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001554 errmsg = "truncated data";
1555 startinpos = ((const char *)q)-starts;
1556 endinpos = ((const char *)e)-starts;
1557 goto utf16Error;
1558 /* The remaining input chars are ignored if the callback
1559 chooses to skip the input */
1560 }
1561 ch = (q[ihi] << 8) | q[ilo];
1562
Tim Peters772747b2001-08-09 22:21:55 +00001563 q += 2;
1564
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565 if (ch < 0xD800 || ch > 0xDFFF) {
1566 *p++ = ch;
1567 continue;
1568 }
1569
1570 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001571 if (q >= e) {
1572 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001573 startinpos = (((const char *)q)-2)-starts;
1574 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001575 goto utf16Error;
1576 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001577 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001578 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1579 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001580 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001581#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001582 *p++ = ch;
1583 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001584#else
1585 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001586#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001587 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001588 }
1589 else {
1590 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001591 startinpos = (((const char *)q)-4)-starts;
1592 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001593 goto utf16Error;
1594 }
1595
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001597 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001598 startinpos = (((const char *)q)-2)-starts;
1599 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001600 /* Fall through to report the error */
1601
1602 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001603 outpos = p-PyUnicode_AS_UNICODE(unicode);
1604 if (unicode_decode_call_errorhandler(
1605 errors, &errorHandler,
1606 "utf16", errmsg,
1607 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1608 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001609 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 }
1611
1612 if (byteorder)
1613 *byteorder = bo;
1614
Walter Dörwald69652032004-09-07 20:24:22 +00001615 if (consumed)
1616 *consumed = (const char *)q-starts;
1617
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001619 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 goto onError;
1621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001622 Py_XDECREF(errorHandler);
1623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624 return (PyObject *)unicode;
1625
1626onError:
1627 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 Py_XDECREF(errorHandler);
1629 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630 return NULL;
1631}
1632
Tim Peters772747b2001-08-09 22:21:55 +00001633PyObject *
1634PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001635 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001636 const char *errors,
1637 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638{
1639 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001640 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001641#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001642 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001643#else
1644 const int pairs = 0;
1645#endif
Tim Peters772747b2001-08-09 22:21:55 +00001646 /* Offsets from p for storing byte pairs in the right order. */
1647#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1648 int ihi = 1, ilo = 0;
1649#else
1650 int ihi = 0, ilo = 1;
1651#endif
1652
1653#define STORECHAR(CH) \
1654 do { \
1655 p[ihi] = ((CH) >> 8) & 0xff; \
1656 p[ilo] = (CH) & 0xff; \
1657 p += 2; \
1658 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001660#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001661 for (i = pairs = 0; i < size; i++)
1662 if (s[i] >= 0x10000)
1663 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001664#endif
Tim Petersced69f82003-09-16 20:30:58 +00001665 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001666 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667 if (v == NULL)
1668 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669
Tim Peters772747b2001-08-09 22:21:55 +00001670 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001672 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001673 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001674 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001675
1676 if (byteorder == -1) {
1677 /* force LE */
1678 ihi = 1;
1679 ilo = 0;
1680 }
1681 else if (byteorder == 1) {
1682 /* force BE */
1683 ihi = 0;
1684 ilo = 1;
1685 }
1686
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001687 while (size-- > 0) {
1688 Py_UNICODE ch = *s++;
1689 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001690#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001691 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001692 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1693 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001695#endif
Tim Peters772747b2001-08-09 22:21:55 +00001696 STORECHAR(ch);
1697 if (ch2)
1698 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001699 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001701#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702}
1703
1704PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1705{
1706 if (!PyUnicode_Check(unicode)) {
1707 PyErr_BadArgument();
1708 return NULL;
1709 }
1710 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1711 PyUnicode_GET_SIZE(unicode),
1712 NULL,
1713 0);
1714}
1715
1716/* --- Unicode Escape Codec ----------------------------------------------- */
1717
Fredrik Lundh06d12682001-01-24 07:59:11 +00001718static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001719
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001721 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 const char *errors)
1723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001724 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001725 Py_ssize_t startinpos;
1726 Py_ssize_t endinpos;
1727 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001732 char* message;
1733 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001734 PyObject *errorHandler = NULL;
1735 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001736
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 /* Escaped strings will always be longer than the resulting
1738 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 length after conversion to the true value.
1740 (but if the error callback returns a long replacement string
1741 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 v = _PyUnicode_New(size);
1743 if (v == NULL)
1744 goto onError;
1745 if (size == 0)
1746 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001750
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 while (s < end) {
1752 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001753 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001754 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755
1756 /* Non-escape characters are interpreted as Unicode ordinals */
1757 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001758 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 continue;
1760 }
1761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001762 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 /* \ - Escapes */
1764 s++;
1765 switch (*s++) {
1766
1767 /* \x escapes */
1768 case '\n': break;
1769 case '\\': *p++ = '\\'; break;
1770 case '\'': *p++ = '\''; break;
1771 case '\"': *p++ = '\"'; break;
1772 case 'b': *p++ = '\b'; break;
1773 case 'f': *p++ = '\014'; break; /* FF */
1774 case 't': *p++ = '\t'; break;
1775 case 'n': *p++ = '\n'; break;
1776 case 'r': *p++ = '\r'; break;
1777 case 'v': *p++ = '\013'; break; /* VT */
1778 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1779
1780 /* \OOO (octal) escapes */
1781 case '0': case '1': case '2': case '3':
1782 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001783 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001785 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001787 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001789 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 break;
1791
Fredrik Lundhccc74732001-02-18 22:13:49 +00001792 /* hex escapes */
1793 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001795 digits = 2;
1796 message = "truncated \\xXX escape";
1797 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798
Fredrik Lundhccc74732001-02-18 22:13:49 +00001799 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801 digits = 4;
1802 message = "truncated \\uXXXX escape";
1803 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804
Fredrik Lundhccc74732001-02-18 22:13:49 +00001805 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001806 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001807 digits = 8;
1808 message = "truncated \\UXXXXXXXX escape";
1809 hexescape:
1810 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 outpos = p-PyUnicode_AS_UNICODE(v);
1812 if (s+digits>end) {
1813 endinpos = size;
1814 if (unicode_decode_call_errorhandler(
1815 errors, &errorHandler,
1816 "unicodeescape", "end of string in escape sequence",
1817 starts, size, &startinpos, &endinpos, &exc, &s,
1818 (PyObject **)&v, &outpos, &p))
1819 goto onError;
1820 goto nextByte;
1821 }
1822 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001823 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001824 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825 endinpos = (s+i+1)-starts;
1826 if (unicode_decode_call_errorhandler(
1827 errors, &errorHandler,
1828 "unicodeescape", message,
1829 starts, size, &startinpos, &endinpos, &exc, &s,
1830 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001831 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001833 }
1834 chr = (chr<<4) & ~0xF;
1835 if (c >= '0' && c <= '9')
1836 chr += c - '0';
1837 else if (c >= 'a' && c <= 'f')
1838 chr += 10 + c - 'a';
1839 else
1840 chr += 10 + c - 'A';
1841 }
1842 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001843 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 /* _decoding_error will have already written into the
1845 target buffer. */
1846 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001847 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001848 /* when we get here, chr is a 32-bit unicode character */
1849 if (chr <= 0xffff)
1850 /* UCS-2 character */
1851 *p++ = (Py_UNICODE) chr;
1852 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001853 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001854 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001855#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001856 *p++ = chr;
1857#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001858 chr -= 0x10000L;
1859 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001860 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001861#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001862 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 endinpos = s-starts;
1864 outpos = p-PyUnicode_AS_UNICODE(v);
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "illegal Unicode character",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001870 goto onError;
1871 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001872 break;
1873
1874 /* \N{name} */
1875 case 'N':
1876 message = "malformed \\N character escape";
1877 if (ucnhash_CAPI == NULL) {
1878 /* load the unicode data module */
1879 PyObject *m, *v;
1880 m = PyImport_ImportModule("unicodedata");
1881 if (m == NULL)
1882 goto ucnhashError;
1883 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1884 Py_DECREF(m);
1885 if (v == NULL)
1886 goto ucnhashError;
1887 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1888 Py_DECREF(v);
1889 if (ucnhash_CAPI == NULL)
1890 goto ucnhashError;
1891 }
1892 if (*s == '{') {
1893 const char *start = s+1;
1894 /* look for the closing brace */
1895 while (*s != '}' && s < end)
1896 s++;
1897 if (s > start && s < end && *s == '}') {
1898 /* found a name. look it up in the unicode database */
1899 message = "unknown Unicode character name";
1900 s++;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 if (ucnhash_CAPI->getcode(start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001902 goto store;
1903 }
1904 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001905 endinpos = s-starts;
1906 outpos = p-PyUnicode_AS_UNICODE(v);
1907 if (unicode_decode_call_errorhandler(
1908 errors, &errorHandler,
1909 "unicodeescape", message,
1910 starts, size, &startinpos, &endinpos, &exc, &s,
1911 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001912 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001913 break;
1914
1915 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001916 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001917 message = "\\ at end of string";
1918 s--;
1919 endinpos = s-starts;
1920 outpos = p-PyUnicode_AS_UNICODE(v);
1921 if (unicode_decode_call_errorhandler(
1922 errors, &errorHandler,
1923 "unicodeescape", message,
1924 starts, size, &startinpos, &endinpos, &exc, &s,
1925 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001926 goto onError;
1927 }
1928 else {
1929 *p++ = '\\';
1930 *p++ = (unsigned char)s[-1];
1931 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001932 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 nextByte:
1935 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001937 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001939 Py_XDECREF(errorHandler);
1940 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001942
Fredrik Lundhccc74732001-02-18 22:13:49 +00001943ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001944 PyErr_SetString(
1945 PyExc_UnicodeError,
1946 "\\N escapes not supported (can't load unicodedata module)"
1947 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 Py_XDECREF(errorHandler);
1949 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001950 return NULL;
1951
Fredrik Lundhccc74732001-02-18 22:13:49 +00001952onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001954 Py_XDECREF(errorHandler);
1955 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956 return NULL;
1957}
1958
1959/* Return a Unicode-Escape string version of the Unicode object.
1960
1961 If quotes is true, the string is enclosed in u"" or u'' quotes as
1962 appropriate.
1963
1964*/
1965
Barry Warsaw51ac5802000-03-20 16:36:48 +00001966static const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001967 Py_ssize_t size,
Barry Warsaw51ac5802000-03-20 16:36:48 +00001968 Py_UNICODE ch);
1969
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970static
1971PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001972 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 int quotes)
1974{
1975 PyObject *repr;
1976 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001978 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979
1980 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1981 if (repr == NULL)
1982 return NULL;
1983
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001984 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985
1986 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001988 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 !findchar(s, size, '"')) ? '"' : '\'';
1990 }
1991 while (size-- > 0) {
1992 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001993
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001994 /* Escape quotes and backslashes */
1995 if ((quotes &&
1996 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 *p++ = '\\';
1998 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001999 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002000 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002001
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002002#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002003 /* Map 21-bit characters to '\U00xxxxxx' */
2004 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002005 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002006
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002007 /* Resize the string if necessary */
2008 if (offset + 12 > PyString_GET_SIZE(repr)) {
2009 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002010 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002011 p = PyString_AS_STRING(repr) + offset;
2012 }
2013
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002014 *p++ = '\\';
2015 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002016 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2017 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2021 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2022 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002023 *p++ = hexdigit[ch & 0x0000000F];
2024 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002025 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002026#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002027 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2028 else if (ch >= 0xD800 && ch < 0xDC00) {
2029 Py_UNICODE ch2;
2030 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002032 ch2 = *s++;
2033 size--;
2034 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2035 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2036 *p++ = '\\';
2037 *p++ = 'U';
2038 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2039 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2043 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2044 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2045 *p++ = hexdigit[ucs & 0x0000000F];
2046 continue;
2047 }
2048 /* Fall through: isolated surrogates are copied as-is */
2049 s--;
2050 size++;
2051 }
2052
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002054 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 *p++ = '\\';
2056 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002057 *p++ = hexdigit[(ch >> 12) & 0x000F];
2058 *p++ = hexdigit[(ch >> 8) & 0x000F];
2059 *p++ = hexdigit[(ch >> 4) & 0x000F];
2060 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002062
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002063 /* Map special whitespace to '\t', \n', '\r' */
2064 else if (ch == '\t') {
2065 *p++ = '\\';
2066 *p++ = 't';
2067 }
2068 else if (ch == '\n') {
2069 *p++ = '\\';
2070 *p++ = 'n';
2071 }
2072 else if (ch == '\r') {
2073 *p++ = '\\';
2074 *p++ = 'r';
2075 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002076
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002077 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002078 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002080 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002081 *p++ = hexdigit[(ch >> 4) & 0x000F];
2082 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002083 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002084
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 /* Copy everything else as-is */
2086 else
2087 *p++ = (char) ch;
2088 }
2089 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002090 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091
2092 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002093 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 return repr;
2095}
2096
2097PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002098 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099{
2100 return unicodeescape_string(s, size, 0);
2101}
2102
2103PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2104{
2105 if (!PyUnicode_Check(unicode)) {
2106 PyErr_BadArgument();
2107 return NULL;
2108 }
2109 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2110 PyUnicode_GET_SIZE(unicode));
2111}
2112
2113/* --- Raw Unicode Escape Codec ------------------------------------------- */
2114
2115PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002116 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 const char *errors)
2118{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002120 Py_ssize_t startinpos;
2121 Py_ssize_t endinpos;
2122 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 const char *end;
2126 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 PyObject *errorHandler = NULL;
2128 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002129
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 /* Escaped strings will always be longer than the resulting
2131 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 length after conversion to the true value. (But decoding error
2133 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 v = _PyUnicode_New(size);
2135 if (v == NULL)
2136 goto onError;
2137 if (size == 0)
2138 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002139 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 end = s + size;
2141 while (s < end) {
2142 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002143 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002145 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146
2147 /* Non-escape characters are interpreted as Unicode ordinals */
2148 if (*s != '\\') {
2149 *p++ = (unsigned char)*s++;
2150 continue;
2151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002152 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153
2154 /* \u-escapes are only interpreted iff the number of leading
2155 backslashes if odd */
2156 bs = s;
2157 for (;s < end;) {
2158 if (*s != '\\')
2159 break;
2160 *p++ = (unsigned char)*s++;
2161 }
2162 if (((s - bs) & 1) == 0 ||
2163 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002164 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 continue;
2166 }
2167 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002168 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 s++;
2170
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002171 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002173 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002174 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176 endinpos = s-starts;
2177 if (unicode_decode_call_errorhandler(
2178 errors, &errorHandler,
2179 "rawunicodeescape", "truncated \\uXXXX",
2180 starts, size, &startinpos, &endinpos, &exc, &s,
2181 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184 }
2185 x = (x<<4) & ~0xF;
2186 if (c >= '0' && c <= '9')
2187 x += c - '0';
2188 else if (c >= 'a' && c <= 'f')
2189 x += 10 + c - 'a';
2190 else
2191 x += 10 + c - 'A';
2192 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002193#ifndef Py_UNICODE_WIDE
2194 if (x > 0x10000) {
2195 if (unicode_decode_call_errorhandler(
2196 errors, &errorHandler,
2197 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2198 starts, size, &startinpos, &endinpos, &exc, &s,
2199 (PyObject **)&v, &outpos, &p))
2200 goto onError;
2201 }
2202#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002203 *p++ = x;
2204 nextByte:
2205 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002207 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002208 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002209 Py_XDECREF(errorHandler);
2210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002212
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 onError:
2214 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 Py_XDECREF(errorHandler);
2216 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 return NULL;
2218}
2219
2220PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002221 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222{
2223 PyObject *repr;
2224 char *p;
2225 char *q;
2226
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002227 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002229#ifdef Py_UNICODE_WIDE
2230 repr = PyString_FromStringAndSize(NULL, 10 * size);
2231#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002233#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 if (repr == NULL)
2235 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002236 if (size == 0)
2237 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238
2239 p = q = PyString_AS_STRING(repr);
2240 while (size-- > 0) {
2241 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002242#ifdef Py_UNICODE_WIDE
2243 /* Map 32-bit characters to '\Uxxxxxxxx' */
2244 if (ch >= 0x10000) {
2245 *p++ = '\\';
2246 *p++ = 'U';
2247 *p++ = hexdigit[(ch >> 28) & 0xf];
2248 *p++ = hexdigit[(ch >> 24) & 0xf];
2249 *p++ = hexdigit[(ch >> 20) & 0xf];
2250 *p++ = hexdigit[(ch >> 16) & 0xf];
2251 *p++ = hexdigit[(ch >> 12) & 0xf];
2252 *p++ = hexdigit[(ch >> 8) & 0xf];
2253 *p++ = hexdigit[(ch >> 4) & 0xf];
2254 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002255 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256 else
2257#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 /* Map 16-bit characters to '\uxxxx' */
2259 if (ch >= 256) {
2260 *p++ = '\\';
2261 *p++ = 'u';
2262 *p++ = hexdigit[(ch >> 12) & 0xf];
2263 *p++ = hexdigit[(ch >> 8) & 0xf];
2264 *p++ = hexdigit[(ch >> 4) & 0xf];
2265 *p++ = hexdigit[ch & 15];
2266 }
2267 /* Copy everything else as-is */
2268 else
2269 *p++ = (char) ch;
2270 }
2271 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002272 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 return repr;
2274}
2275
2276PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2277{
2278 if (!PyUnicode_Check(unicode)) {
2279 PyErr_BadArgument();
2280 return NULL;
2281 }
2282 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2283 PyUnicode_GET_SIZE(unicode));
2284}
2285
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002286/* --- Unicode Internal Codec ------------------------------------------- */
2287
2288PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002289 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002290 const char *errors)
2291{
2292 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002293 Py_ssize_t startinpos;
2294 Py_ssize_t endinpos;
2295 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002296 PyUnicodeObject *v;
2297 Py_UNICODE *p;
2298 const char *end;
2299 const char *reason;
2300 PyObject *errorHandler = NULL;
2301 PyObject *exc = NULL;
2302
Neal Norwitzd43069c2006-01-08 01:12:10 +00002303#ifdef Py_UNICODE_WIDE
2304 Py_UNICODE unimax = PyUnicode_GetMax();
2305#endif
2306
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002307 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2308 if (v == NULL)
2309 goto onError;
2310 if (PyUnicode_GetSize((PyObject *)v) == 0)
2311 return (PyObject *)v;
2312 p = PyUnicode_AS_UNICODE(v);
2313 end = s + size;
2314
2315 while (s < end) {
2316 *p = *(Py_UNICODE *)s;
2317 /* We have to sanity check the raw data, otherwise doom looms for
2318 some malformed UCS-4 data. */
2319 if (
2320 #ifdef Py_UNICODE_WIDE
2321 *p > unimax || *p < 0 ||
2322 #endif
2323 end-s < Py_UNICODE_SIZE
2324 )
2325 {
2326 startinpos = s - starts;
2327 if (end-s < Py_UNICODE_SIZE) {
2328 endinpos = end-starts;
2329 reason = "truncated input";
2330 }
2331 else {
2332 endinpos = s - starts + Py_UNICODE_SIZE;
2333 reason = "illegal code point (> 0x10FFFF)";
2334 }
2335 outpos = p - PyUnicode_AS_UNICODE(v);
2336 if (unicode_decode_call_errorhandler(
2337 errors, &errorHandler,
2338 "unicode_internal", reason,
2339 starts, size, &startinpos, &endinpos, &exc, &s,
2340 (PyObject **)&v, &outpos, &p)) {
2341 goto onError;
2342 }
2343 }
2344 else {
2345 p++;
2346 s += Py_UNICODE_SIZE;
2347 }
2348 }
2349
2350 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2351 goto onError;
2352 Py_XDECREF(errorHandler);
2353 Py_XDECREF(exc);
2354 return (PyObject *)v;
2355
2356 onError:
2357 Py_XDECREF(v);
2358 Py_XDECREF(errorHandler);
2359 Py_XDECREF(exc);
2360 return NULL;
2361}
2362
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363/* --- Latin-1 Codec ------------------------------------------------------ */
2364
2365PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002366 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367 const char *errors)
2368{
2369 PyUnicodeObject *v;
2370 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002371
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002373 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002374 Py_UNICODE r = *(unsigned char*)s;
2375 return PyUnicode_FromUnicode(&r, 1);
2376 }
2377
Guido van Rossumd57fd912000-03-10 22:53:23 +00002378 v = _PyUnicode_New(size);
2379 if (v == NULL)
2380 goto onError;
2381 if (size == 0)
2382 return (PyObject *)v;
2383 p = PyUnicode_AS_UNICODE(v);
2384 while (size-- > 0)
2385 *p++ = (unsigned char)*s++;
2386 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002387
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388 onError:
2389 Py_XDECREF(v);
2390 return NULL;
2391}
2392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002393/* create or adjust a UnicodeEncodeError */
2394static void make_encode_exception(PyObject **exceptionObject,
2395 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002396 const Py_UNICODE *unicode, Py_ssize_t size,
2397 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002400 if (*exceptionObject == NULL) {
2401 *exceptionObject = PyUnicodeEncodeError_Create(
2402 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 }
2404 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002405 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2406 goto onError;
2407 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2408 goto onError;
2409 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2410 goto onError;
2411 return;
2412 onError:
2413 Py_DECREF(*exceptionObject);
2414 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415 }
2416}
2417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002418/* raises a UnicodeEncodeError */
2419static void raise_encode_exception(PyObject **exceptionObject,
2420 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002421 const Py_UNICODE *unicode, Py_ssize_t size,
2422 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002423 const char *reason)
2424{
2425 make_encode_exception(exceptionObject,
2426 encoding, unicode, size, startpos, endpos, reason);
2427 if (*exceptionObject != NULL)
2428 PyCodec_StrictErrors(*exceptionObject);
2429}
2430
2431/* error handling callback helper:
2432 build arguments, call the callback and check the arguments,
2433 put the result into newpos and return the replacement string, which
2434 has to be freed by the caller */
2435static PyObject *unicode_encode_call_errorhandler(const char *errors,
2436 PyObject **errorHandler,
2437 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002438 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2439 Py_ssize_t startpos, Py_ssize_t endpos,
2440 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002442 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002443
2444 PyObject *restuple;
2445 PyObject *resunicode;
2446
2447 if (*errorHandler == NULL) {
2448 *errorHandler = PyCodec_LookupError(errors);
2449 if (*errorHandler == NULL)
2450 return NULL;
2451 }
2452
2453 make_encode_exception(exceptionObject,
2454 encoding, unicode, size, startpos, endpos, reason);
2455 if (*exceptionObject == NULL)
2456 return NULL;
2457
2458 restuple = PyObject_CallFunctionObjArgs(
2459 *errorHandler, *exceptionObject, NULL);
2460 if (restuple == NULL)
2461 return NULL;
2462 if (!PyTuple_Check(restuple)) {
2463 PyErr_Format(PyExc_TypeError, &argparse[4]);
2464 Py_DECREF(restuple);
2465 return NULL;
2466 }
2467 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2468 &resunicode, newpos)) {
2469 Py_DECREF(restuple);
2470 return NULL;
2471 }
2472 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002473 *newpos = size+*newpos;
2474 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002475 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002476 Py_DECREF(restuple);
2477 return NULL;
2478 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 Py_INCREF(resunicode);
2480 Py_DECREF(restuple);
2481 return resunicode;
2482}
2483
2484static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002485 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 const char *errors,
2487 int limit)
2488{
2489 /* output object */
2490 PyObject *res;
2491 /* pointers to the beginning and end+1 of input */
2492 const Py_UNICODE *startp = p;
2493 const Py_UNICODE *endp = p + size;
2494 /* pointer to the beginning of the unencodable characters */
2495 /* const Py_UNICODE *badp = NULL; */
2496 /* pointer into the output */
2497 char *str;
2498 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002499 Py_ssize_t respos = 0;
2500 Py_ssize_t ressize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002501 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2502 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2503 PyObject *errorHandler = NULL;
2504 PyObject *exc = NULL;
2505 /* the following variable is used for caching string comparisons
2506 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2507 int known_errorHandler = -1;
2508
2509 /* allocate enough for a simple encoding without
2510 replacements, if we need more, we'll resize */
2511 res = PyString_FromStringAndSize(NULL, size);
2512 if (res == NULL)
2513 goto onError;
2514 if (size == 0)
2515 return res;
2516 str = PyString_AS_STRING(res);
2517 ressize = size;
2518
2519 while (p<endp) {
2520 Py_UNICODE c = *p;
2521
2522 /* can we encode this? */
2523 if (c<limit) {
2524 /* no overflow check, because we know that the space is enough */
2525 *str++ = (char)c;
2526 ++p;
2527 }
2528 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002529 Py_ssize_t unicodepos = p-startp;
2530 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002532 Py_ssize_t repsize;
2533 Py_ssize_t newpos;
2534 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002535 Py_UNICODE *uni2;
2536 /* startpos for collecting unencodable chars */
2537 const Py_UNICODE *collstart = p;
2538 const Py_UNICODE *collend = p;
2539 /* find all unecodable characters */
2540 while ((collend < endp) && ((*collend)>=limit))
2541 ++collend;
2542 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2543 if (known_errorHandler==-1) {
2544 if ((errors==NULL) || (!strcmp(errors, "strict")))
2545 known_errorHandler = 1;
2546 else if (!strcmp(errors, "replace"))
2547 known_errorHandler = 2;
2548 else if (!strcmp(errors, "ignore"))
2549 known_errorHandler = 3;
2550 else if (!strcmp(errors, "xmlcharrefreplace"))
2551 known_errorHandler = 4;
2552 else
2553 known_errorHandler = 0;
2554 }
2555 switch (known_errorHandler) {
2556 case 1: /* strict */
2557 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2558 goto onError;
2559 case 2: /* replace */
2560 while (collstart++<collend)
2561 *str++ = '?'; /* fall through */
2562 case 3: /* ignore */
2563 p = collend;
2564 break;
2565 case 4: /* xmlcharrefreplace */
2566 respos = str-PyString_AS_STRING(res);
2567 /* determine replacement size (temporarily (mis)uses p) */
2568 for (p = collstart, repsize = 0; p < collend; ++p) {
2569 if (*p<10)
2570 repsize += 2+1+1;
2571 else if (*p<100)
2572 repsize += 2+2+1;
2573 else if (*p<1000)
2574 repsize += 2+3+1;
2575 else if (*p<10000)
2576 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002577#ifndef Py_UNICODE_WIDE
2578 else
2579 repsize += 2+5+1;
2580#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002581 else if (*p<100000)
2582 repsize += 2+5+1;
2583 else if (*p<1000000)
2584 repsize += 2+6+1;
2585 else
2586 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002587#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 }
2589 requiredsize = respos+repsize+(endp-collend);
2590 if (requiredsize > ressize) {
2591 if (requiredsize<2*ressize)
2592 requiredsize = 2*ressize;
2593 if (_PyString_Resize(&res, requiredsize))
2594 goto onError;
2595 str = PyString_AS_STRING(res) + respos;
2596 ressize = requiredsize;
2597 }
2598 /* generate replacement (temporarily (mis)uses p) */
2599 for (p = collstart; p < collend; ++p) {
2600 str += sprintf(str, "&#%d;", (int)*p);
2601 }
2602 p = collend;
2603 break;
2604 default:
2605 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2606 encoding, reason, startp, size, &exc,
2607 collstart-startp, collend-startp, &newpos);
2608 if (repunicode == NULL)
2609 goto onError;
2610 /* need more space? (at least enough for what we
2611 have+the replacement+the rest of the string, so
2612 we won't have to check space for encodable characters) */
2613 respos = str-PyString_AS_STRING(res);
2614 repsize = PyUnicode_GET_SIZE(repunicode);
2615 requiredsize = respos+repsize+(endp-collend);
2616 if (requiredsize > ressize) {
2617 if (requiredsize<2*ressize)
2618 requiredsize = 2*ressize;
2619 if (_PyString_Resize(&res, requiredsize)) {
2620 Py_DECREF(repunicode);
2621 goto onError;
2622 }
2623 str = PyString_AS_STRING(res) + respos;
2624 ressize = requiredsize;
2625 }
2626 /* check if there is anything unencodable in the replacement
2627 and copy it to the output */
2628 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2629 c = *uni2;
2630 if (c >= limit) {
2631 raise_encode_exception(&exc, encoding, startp, size,
2632 unicodepos, unicodepos+1, reason);
2633 Py_DECREF(repunicode);
2634 goto onError;
2635 }
2636 *str = (char)c;
2637 }
2638 p = startp + newpos;
2639 Py_DECREF(repunicode);
2640 }
2641 }
2642 }
2643 /* Resize if we allocated to much */
2644 respos = str-PyString_AS_STRING(res);
2645 if (respos<ressize)
2646 /* If this falls res will be NULL */
2647 _PyString_Resize(&res, respos);
2648 Py_XDECREF(errorHandler);
2649 Py_XDECREF(exc);
2650 return res;
2651
2652 onError:
2653 Py_XDECREF(res);
2654 Py_XDECREF(errorHandler);
2655 Py_XDECREF(exc);
2656 return NULL;
2657}
2658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002660 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 const char *errors)
2662{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664}
2665
2666PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2667{
2668 if (!PyUnicode_Check(unicode)) {
2669 PyErr_BadArgument();
2670 return NULL;
2671 }
2672 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2673 PyUnicode_GET_SIZE(unicode),
2674 NULL);
2675}
2676
2677/* --- 7-bit ASCII Codec -------------------------------------------------- */
2678
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002680 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 const char *errors)
2682{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 PyUnicodeObject *v;
2685 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002686 Py_ssize_t startinpos;
2687 Py_ssize_t endinpos;
2688 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002689 const char *e;
2690 PyObject *errorHandler = NULL;
2691 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002692
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002694 if (size == 1 && *(unsigned char*)s < 128) {
2695 Py_UNICODE r = *(unsigned char*)s;
2696 return PyUnicode_FromUnicode(&r, 1);
2697 }
Tim Petersced69f82003-09-16 20:30:58 +00002698
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 v = _PyUnicode_New(size);
2700 if (v == NULL)
2701 goto onError;
2702 if (size == 0)
2703 return (PyObject *)v;
2704 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 e = s + size;
2706 while (s < e) {
2707 register unsigned char c = (unsigned char)*s;
2708 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 ++s;
2711 }
2712 else {
2713 startinpos = s-starts;
2714 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002715 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 if (unicode_decode_call_errorhandler(
2717 errors, &errorHandler,
2718 "ascii", "ordinal not in range(128)",
2719 starts, size, &startinpos, &endinpos, &exc, &s,
2720 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002724 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002725 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002726 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 Py_XDECREF(errorHandler);
2728 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002730
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 onError:
2732 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 Py_XDECREF(errorHandler);
2734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 return NULL;
2736}
2737
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002739 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 const char *errors)
2741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743}
2744
2745PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2746{
2747 if (!PyUnicode_Check(unicode)) {
2748 PyErr_BadArgument();
2749 return NULL;
2750 }
2751 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2752 PyUnicode_GET_SIZE(unicode),
2753 NULL);
2754}
2755
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002756#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002757
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002758/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002759
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002760PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002761 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002762 const char *errors)
2763{
2764 PyUnicodeObject *v;
2765 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002766 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002767
2768 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002769 assert(size < INT_MAX);
2770 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002771 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002772 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2773
2774 v = _PyUnicode_New(usize);
2775 if (v == NULL)
2776 return NULL;
2777 if (usize == 0)
2778 return (PyObject *)v;
2779 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002780 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002781 Py_DECREF(v);
2782 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2783 }
2784
2785 return (PyObject *)v;
2786}
2787
2788PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002789 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002790 const char *errors)
2791{
2792 PyObject *repr;
2793 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002794 DWORD mbcssize;
2795
2796 /* If there are no characters, bail now! */
2797 if (size==0)
2798 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002799
2800 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002801 assert(size<INT_MAX);
2802 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002803 if (mbcssize==0)
2804 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2805
2806 repr = PyString_FromStringAndSize(NULL, mbcssize);
2807 if (repr == NULL)
2808 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002809 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002810 return repr;
2811
2812 /* Do the conversion */
2813 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002814 assert(size < INT_MAX);
2815 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002816 Py_DECREF(repr);
2817 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2818 }
2819 return repr;
2820}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002821
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002822PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2823{
2824 if (!PyUnicode_Check(unicode)) {
2825 PyErr_BadArgument();
2826 return NULL;
2827 }
2828 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2829 PyUnicode_GET_SIZE(unicode),
2830 NULL);
2831}
2832
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002833#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002834
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835/* --- Character Mapping Codec -------------------------------------------- */
2836
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002838 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 PyObject *mapping,
2840 const char *errors)
2841{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002843 Py_ssize_t startinpos;
2844 Py_ssize_t endinpos;
2845 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 PyUnicodeObject *v;
2848 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002849 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850 PyObject *errorHandler = NULL;
2851 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002852 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002854
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 /* Default to Latin-1 */
2856 if (mapping == NULL)
2857 return PyUnicode_DecodeLatin1(s, size, errors);
2858
2859 v = _PyUnicode_New(size);
2860 if (v == NULL)
2861 goto onError;
2862 if (size == 0)
2863 return (PyObject *)v;
2864 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002866 if (PyUnicode_CheckExact(mapping)) {
2867 mapstring = PyUnicode_AS_UNICODE(mapping);
2868 maplen = PyUnicode_GET_SIZE(mapping);
2869 while (s < e) {
2870 unsigned char ch = *s;
2871 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002873 if (ch < maplen)
2874 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002876 if (x == 0xfffe) {
2877 /* undefined mapping */
2878 outpos = p-PyUnicode_AS_UNICODE(v);
2879 startinpos = s-starts;
2880 endinpos = startinpos+1;
2881 if (unicode_decode_call_errorhandler(
2882 errors, &errorHandler,
2883 "charmap", "character maps to <undefined>",
2884 starts, size, &startinpos, &endinpos, &exc, &s,
2885 (PyObject **)&v, &outpos, &p)) {
2886 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002887 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002888 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002889 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002890 *p++ = x;
2891 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002893 }
2894 else {
2895 while (s < e) {
2896 unsigned char ch = *s;
2897 PyObject *w, *x;
2898
2899 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2900 w = PyInt_FromLong((long)ch);
2901 if (w == NULL)
2902 goto onError;
2903 x = PyObject_GetItem(mapping, w);
2904 Py_DECREF(w);
2905 if (x == NULL) {
2906 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2907 /* No mapping found means: mapping is undefined. */
2908 PyErr_Clear();
2909 x = Py_None;
2910 Py_INCREF(x);
2911 } else
2912 goto onError;
2913 }
2914
2915 /* Apply mapping */
2916 if (PyInt_Check(x)) {
2917 long value = PyInt_AS_LONG(x);
2918 if (value < 0 || value > 65535) {
2919 PyErr_SetString(PyExc_TypeError,
2920 "character mapping must be in range(65536)");
2921 Py_DECREF(x);
2922 goto onError;
2923 }
2924 *p++ = (Py_UNICODE)value;
2925 }
2926 else if (x == Py_None) {
2927 /* undefined mapping */
2928 outpos = p-PyUnicode_AS_UNICODE(v);
2929 startinpos = s-starts;
2930 endinpos = startinpos+1;
2931 if (unicode_decode_call_errorhandler(
2932 errors, &errorHandler,
2933 "charmap", "character maps to <undefined>",
2934 starts, size, &startinpos, &endinpos, &exc, &s,
2935 (PyObject **)&v, &outpos, &p)) {
2936 Py_DECREF(x);
2937 goto onError;
2938 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002939 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002940 continue;
2941 }
2942 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002943 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002944
2945 if (targetsize == 1)
2946 /* 1-1 mapping */
2947 *p++ = *PyUnicode_AS_UNICODE(x);
2948
2949 else if (targetsize > 1) {
2950 /* 1-n mapping */
2951 if (targetsize > extrachars) {
2952 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002953 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
2954 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002955 (targetsize << 2);
2956 extrachars += needed;
2957 if (_PyUnicode_Resize(&v,
2958 PyUnicode_GET_SIZE(v) + needed) < 0) {
2959 Py_DECREF(x);
2960 goto onError;
2961 }
2962 p = PyUnicode_AS_UNICODE(v) + oldpos;
2963 }
2964 Py_UNICODE_COPY(p,
2965 PyUnicode_AS_UNICODE(x),
2966 targetsize);
2967 p += targetsize;
2968 extrachars -= targetsize;
2969 }
2970 /* 1-0 mapping: skip the character */
2971 }
2972 else {
2973 /* wrong return value */
2974 PyErr_SetString(PyExc_TypeError,
2975 "character mapping must return integer, None or unicode");
2976 Py_DECREF(x);
2977 goto onError;
2978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002980 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 }
2983 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002984 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 Py_XDECREF(errorHandler);
2987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002989
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002991 Py_XDECREF(errorHandler);
2992 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 Py_XDECREF(v);
2994 return NULL;
2995}
2996
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997/* Lookup the character ch in the mapping. If the character
2998 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00002999 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003000static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003002 PyObject *w = PyInt_FromLong((long)c);
3003 PyObject *x;
3004
3005 if (w == NULL)
3006 return NULL;
3007 x = PyObject_GetItem(mapping, w);
3008 Py_DECREF(w);
3009 if (x == NULL) {
3010 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3011 /* No mapping found means: mapping is undefined. */
3012 PyErr_Clear();
3013 x = Py_None;
3014 Py_INCREF(x);
3015 return x;
3016 } else
3017 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003019 else if (x == Py_None)
3020 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 else if (PyInt_Check(x)) {
3022 long value = PyInt_AS_LONG(x);
3023 if (value < 0 || value > 255) {
3024 PyErr_SetString(PyExc_TypeError,
3025 "character mapping must be in range(256)");
3026 Py_DECREF(x);
3027 return NULL;
3028 }
3029 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 else if (PyString_Check(x))
3032 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003034 /* wrong return value */
3035 PyErr_SetString(PyExc_TypeError,
3036 "character mapping must return integer, None or str");
3037 Py_DECREF(x);
3038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
3040}
3041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042/* lookup the character, put the result in the output string and adjust
3043 various state variables. Reallocate the output string if not enough
3044 space is available. Return a new reference to the object that
3045 was put in the output buffer, or Py_None, if the mapping was undefined
3046 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003047 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048static
3049PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003050 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051{
3052 PyObject *rep = charmapencode_lookup(c, mapping);
3053
3054 if (rep==NULL)
3055 return NULL;
3056 else if (rep==Py_None)
3057 return rep;
3058 else {
3059 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003060 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003062 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 if (outsize<requiredsize) {
3064 /* exponentially overallocate to minimize reallocations */
3065 if (requiredsize < 2*outsize)
3066 requiredsize = 2*outsize;
3067 if (_PyString_Resize(outobj, requiredsize)) {
3068 Py_DECREF(rep);
3069 return NULL;
3070 }
3071 outstart = PyString_AS_STRING(*outobj);
3072 }
3073 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3074 }
3075 else {
3076 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003077 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3078 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 if (outsize<requiredsize) {
3080 /* exponentially overallocate to minimize reallocations */
3081 if (requiredsize < 2*outsize)
3082 requiredsize = 2*outsize;
3083 if (_PyString_Resize(outobj, requiredsize)) {
3084 Py_DECREF(rep);
3085 return NULL;
3086 }
3087 outstart = PyString_AS_STRING(*outobj);
3088 }
3089 memcpy(outstart + *outpos, repchars, repsize);
3090 *outpos += repsize;
3091 }
3092 }
3093 return rep;
3094}
3095
3096/* handle an error in PyUnicode_EncodeCharmap
3097 Return 0 on success, -1 on error */
3098static
3099int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003100 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003102 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003103 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003104{
3105 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003106 Py_ssize_t repsize;
3107 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 Py_UNICODE *uni2;
3109 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003110 Py_ssize_t collstartpos = *inpos;
3111 Py_ssize_t collendpos = *inpos+1;
3112 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003113 char *encoding = "charmap";
3114 char *reason = "character maps to <undefined>";
3115
3116 PyObject *x;
3117 /* find all unencodable characters */
3118 while (collendpos < size) {
3119 x = charmapencode_lookup(p[collendpos], mapping);
3120 if (x==NULL)
3121 return -1;
3122 else if (x!=Py_None) {
3123 Py_DECREF(x);
3124 break;
3125 }
3126 Py_DECREF(x);
3127 ++collendpos;
3128 }
3129 /* cache callback name lookup
3130 * (if not done yet, i.e. it's the first error) */
3131 if (*known_errorHandler==-1) {
3132 if ((errors==NULL) || (!strcmp(errors, "strict")))
3133 *known_errorHandler = 1;
3134 else if (!strcmp(errors, "replace"))
3135 *known_errorHandler = 2;
3136 else if (!strcmp(errors, "ignore"))
3137 *known_errorHandler = 3;
3138 else if (!strcmp(errors, "xmlcharrefreplace"))
3139 *known_errorHandler = 4;
3140 else
3141 *known_errorHandler = 0;
3142 }
3143 switch (*known_errorHandler) {
3144 case 1: /* strict */
3145 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3146 return -1;
3147 case 2: /* replace */
3148 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3149 x = charmapencode_output('?', mapping, res, respos);
3150 if (x==NULL) {
3151 return -1;
3152 }
3153 else if (x==Py_None) {
3154 Py_DECREF(x);
3155 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3156 return -1;
3157 }
3158 Py_DECREF(x);
3159 }
3160 /* fall through */
3161 case 3: /* ignore */
3162 *inpos = collendpos;
3163 break;
3164 case 4: /* xmlcharrefreplace */
3165 /* generate replacement (temporarily (mis)uses p) */
3166 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3167 char buffer[2+29+1+1];
3168 char *cp;
3169 sprintf(buffer, "&#%d;", (int)p[collpos]);
3170 for (cp = buffer; *cp; ++cp) {
3171 x = charmapencode_output(*cp, mapping, res, respos);
3172 if (x==NULL)
3173 return -1;
3174 else if (x==Py_None) {
3175 Py_DECREF(x);
3176 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3177 return -1;
3178 }
3179 Py_DECREF(x);
3180 }
3181 }
3182 *inpos = collendpos;
3183 break;
3184 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003185 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 encoding, reason, p, size, exceptionObject,
3187 collstartpos, collendpos, &newpos);
3188 if (repunicode == NULL)
3189 return -1;
3190 /* generate replacement */
3191 repsize = PyUnicode_GET_SIZE(repunicode);
3192 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3193 x = charmapencode_output(*uni2, mapping, res, respos);
3194 if (x==NULL) {
3195 Py_DECREF(repunicode);
3196 return -1;
3197 }
3198 else if (x==Py_None) {
3199 Py_DECREF(repunicode);
3200 Py_DECREF(x);
3201 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3202 return -1;
3203 }
3204 Py_DECREF(x);
3205 }
3206 *inpos = newpos;
3207 Py_DECREF(repunicode);
3208 }
3209 return 0;
3210}
3211
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003213 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 PyObject *mapping,
3215 const char *errors)
3216{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 /* output object */
3218 PyObject *res = NULL;
3219 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003220 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003222 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 PyObject *errorHandler = NULL;
3224 PyObject *exc = NULL;
3225 /* the following variable is used for caching string comparisons
3226 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3227 * 3=ignore, 4=xmlcharrefreplace */
3228 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229
3230 /* Default to Latin-1 */
3231 if (mapping == NULL)
3232 return PyUnicode_EncodeLatin1(p, size, errors);
3233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 /* allocate enough for a simple encoding without
3235 replacements, if we need more, we'll resize */
3236 res = PyString_FromStringAndSize(NULL, size);
3237 if (res == NULL)
3238 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003239 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242 while (inpos<size) {
3243 /* try to encode it */
3244 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3245 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 if (x==Py_None) { /* unencodable character */
3248 if (charmap_encoding_error(p, size, &inpos, mapping,
3249 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003250 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003251 &res, &respos)) {
3252 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003253 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 else
3257 /* done with this character => adjust input position */
3258 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 Py_DECREF(x);
3260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 /* Resize if we allocated to much */
3263 if (respos<PyString_GET_SIZE(res)) {
3264 if (_PyString_Resize(&res, respos))
3265 goto onError;
3266 }
3267 Py_XDECREF(exc);
3268 Py_XDECREF(errorHandler);
3269 return res;
3270
3271 onError:
3272 Py_XDECREF(res);
3273 Py_XDECREF(exc);
3274 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 return NULL;
3276}
3277
3278PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3279 PyObject *mapping)
3280{
3281 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3282 PyErr_BadArgument();
3283 return NULL;
3284 }
3285 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3286 PyUnicode_GET_SIZE(unicode),
3287 mapping,
3288 NULL);
3289}
3290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291/* create or adjust a UnicodeTranslateError */
3292static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003293 const Py_UNICODE *unicode, Py_ssize_t size,
3294 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 if (*exceptionObject == NULL) {
3298 *exceptionObject = PyUnicodeTranslateError_Create(
3299 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 }
3301 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003302 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3303 goto onError;
3304 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3305 goto onError;
3306 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3307 goto onError;
3308 return;
3309 onError:
3310 Py_DECREF(*exceptionObject);
3311 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 }
3313}
3314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315/* raises a UnicodeTranslateError */
3316static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003317 const Py_UNICODE *unicode, Py_ssize_t size,
3318 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 const char *reason)
3320{
3321 make_translate_exception(exceptionObject,
3322 unicode, size, startpos, endpos, reason);
3323 if (*exceptionObject != NULL)
3324 PyCodec_StrictErrors(*exceptionObject);
3325}
3326
3327/* error handling callback helper:
3328 build arguments, call the callback and check the arguments,
3329 put the result into newpos and return the replacement string, which
3330 has to be freed by the caller */
3331static PyObject *unicode_translate_call_errorhandler(const char *errors,
3332 PyObject **errorHandler,
3333 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003334 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3335 Py_ssize_t startpos, Py_ssize_t endpos,
3336 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337{
3338 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3339
Martin v. Löwis18e16552006-02-15 17:27:45 +00003340 int i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 PyObject *restuple;
3342 PyObject *resunicode;
3343
3344 if (*errorHandler == NULL) {
3345 *errorHandler = PyCodec_LookupError(errors);
3346 if (*errorHandler == NULL)
3347 return NULL;
3348 }
3349
3350 make_translate_exception(exceptionObject,
3351 unicode, size, startpos, endpos, reason);
3352 if (*exceptionObject == NULL)
3353 return NULL;
3354
3355 restuple = PyObject_CallFunctionObjArgs(
3356 *errorHandler, *exceptionObject, NULL);
3357 if (restuple == NULL)
3358 return NULL;
3359 if (!PyTuple_Check(restuple)) {
3360 PyErr_Format(PyExc_TypeError, &argparse[4]);
3361 Py_DECREF(restuple);
3362 return NULL;
3363 }
3364 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003365 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 Py_DECREF(restuple);
3367 return NULL;
3368 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003369 if (i_newpos<0)
3370 *newpos = size+i_newpos;
3371 else
3372 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003373 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003374 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003375 Py_DECREF(restuple);
3376 return NULL;
3377 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 Py_INCREF(resunicode);
3379 Py_DECREF(restuple);
3380 return resunicode;
3381}
3382
3383/* Lookup the character ch in the mapping and put the result in result,
3384 which must be decrefed by the caller.
3385 Return 0 on success, -1 on error */
3386static
3387int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3388{
3389 PyObject *w = PyInt_FromLong((long)c);
3390 PyObject *x;
3391
3392 if (w == NULL)
3393 return -1;
3394 x = PyObject_GetItem(mapping, w);
3395 Py_DECREF(w);
3396 if (x == NULL) {
3397 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3398 /* No mapping found means: use 1:1 mapping. */
3399 PyErr_Clear();
3400 *result = NULL;
3401 return 0;
3402 } else
3403 return -1;
3404 }
3405 else if (x == Py_None) {
3406 *result = x;
3407 return 0;
3408 }
3409 else if (PyInt_Check(x)) {
3410 long value = PyInt_AS_LONG(x);
3411 long max = PyUnicode_GetMax();
3412 if (value < 0 || value > max) {
3413 PyErr_Format(PyExc_TypeError,
3414 "character mapping must be in range(0x%lx)", max+1);
3415 Py_DECREF(x);
3416 return -1;
3417 }
3418 *result = x;
3419 return 0;
3420 }
3421 else if (PyUnicode_Check(x)) {
3422 *result = x;
3423 return 0;
3424 }
3425 else {
3426 /* wrong return value */
3427 PyErr_SetString(PyExc_TypeError,
3428 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003429 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 return -1;
3431 }
3432}
3433/* ensure that *outobj is at least requiredsize characters long,
3434if not reallocate and adjust various state variables.
3435Return 0 on success, -1 on error */
3436static
Walter Dörwald4894c302003-10-24 14:25:28 +00003437int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003438 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003440 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003441 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003443 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003445 if (requiredsize < 2 * oldsize)
3446 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003447 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 return -1;
3449 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450 }
3451 return 0;
3452}
3453/* lookup the character, put the result in the output string and adjust
3454 various state variables. Return a new reference to the object that
3455 was put in the output buffer in *result, or Py_None, if the mapping was
3456 undefined (in which case no character was written).
3457 The called must decref result.
3458 Return 0 on success, -1 on error. */
3459static
Walter Dörwald4894c302003-10-24 14:25:28 +00003460int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003461 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003462 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463{
Walter Dörwald4894c302003-10-24 14:25:28 +00003464 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 return -1;
3466 if (*res==NULL) {
3467 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003468 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 }
3470 else if (*res==Py_None)
3471 ;
3472 else if (PyInt_Check(*res)) {
3473 /* no overflow check, because we know that the space is enough */
3474 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3475 }
3476 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003477 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 if (repsize==1) {
3479 /* no overflow check, because we know that the space is enough */
3480 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3481 }
3482 else if (repsize!=0) {
3483 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003484 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003485 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003486 repsize - 1;
3487 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 return -1;
3489 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3490 *outp += repsize;
3491 }
3492 }
3493 else
3494 return -1;
3495 return 0;
3496}
3497
3498PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003499 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 PyObject *mapping,
3501 const char *errors)
3502{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 /* output object */
3504 PyObject *res = NULL;
3505 /* pointers to the beginning and end+1 of input */
3506 const Py_UNICODE *startp = p;
3507 const Py_UNICODE *endp = p + size;
3508 /* pointer into the output */
3509 Py_UNICODE *str;
3510 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003511 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 char *reason = "character maps to <undefined>";
3513 PyObject *errorHandler = NULL;
3514 PyObject *exc = NULL;
3515 /* the following variable is used for caching string comparisons
3516 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3517 * 3=ignore, 4=xmlcharrefreplace */
3518 int known_errorHandler = -1;
3519
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 if (mapping == NULL) {
3521 PyErr_BadArgument();
3522 return NULL;
3523 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003524
3525 /* allocate enough for a simple 1:1 translation without
3526 replacements, if we need more, we'll resize */
3527 res = PyUnicode_FromUnicode(NULL, size);
3528 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003529 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 return res;
3532 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 while (p<endp) {
3535 /* try to encode it */
3536 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003537 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 goto onError;
3540 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003541 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 if (x!=Py_None) /* it worked => adjust input pointer */
3543 ++p;
3544 else { /* untranslatable character */
3545 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003546 Py_ssize_t repsize;
3547 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 Py_UNICODE *uni2;
3549 /* startpos for collecting untranslatable chars */
3550 const Py_UNICODE *collstart = p;
3551 const Py_UNICODE *collend = p+1;
3552 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 /* find all untranslatable characters */
3555 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003556 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 goto onError;
3558 Py_XDECREF(x);
3559 if (x!=Py_None)
3560 break;
3561 ++collend;
3562 }
3563 /* cache callback name lookup
3564 * (if not done yet, i.e. it's the first error) */
3565 if (known_errorHandler==-1) {
3566 if ((errors==NULL) || (!strcmp(errors, "strict")))
3567 known_errorHandler = 1;
3568 else if (!strcmp(errors, "replace"))
3569 known_errorHandler = 2;
3570 else if (!strcmp(errors, "ignore"))
3571 known_errorHandler = 3;
3572 else if (!strcmp(errors, "xmlcharrefreplace"))
3573 known_errorHandler = 4;
3574 else
3575 known_errorHandler = 0;
3576 }
3577 switch (known_errorHandler) {
3578 case 1: /* strict */
3579 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3580 goto onError;
3581 case 2: /* replace */
3582 /* No need to check for space, this is a 1:1 replacement */
3583 for (coll = collstart; coll<collend; ++coll)
3584 *str++ = '?';
3585 /* fall through */
3586 case 3: /* ignore */
3587 p = collend;
3588 break;
3589 case 4: /* xmlcharrefreplace */
3590 /* generate replacement (temporarily (mis)uses p) */
3591 for (p = collstart; p < collend; ++p) {
3592 char buffer[2+29+1+1];
3593 char *cp;
3594 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003595 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3597 goto onError;
3598 for (cp = buffer; *cp; ++cp)
3599 *str++ = *cp;
3600 }
3601 p = collend;
3602 break;
3603 default:
3604 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3605 reason, startp, size, &exc,
3606 collstart-startp, collend-startp, &newpos);
3607 if (repunicode == NULL)
3608 goto onError;
3609 /* generate replacement */
3610 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003611 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3613 Py_DECREF(repunicode);
3614 goto onError;
3615 }
3616 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3617 *str++ = *uni2;
3618 p = startp + newpos;
3619 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 }
3621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 /* Resize if we allocated to much */
3624 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003625 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003626 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003627 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 }
3629 Py_XDECREF(exc);
3630 Py_XDECREF(errorHandler);
3631 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 onError:
3634 Py_XDECREF(res);
3635 Py_XDECREF(exc);
3636 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 return NULL;
3638}
3639
3640PyObject *PyUnicode_Translate(PyObject *str,
3641 PyObject *mapping,
3642 const char *errors)
3643{
3644 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 str = PyUnicode_FromObject(str);
3647 if (str == NULL)
3648 goto onError;
3649 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3650 PyUnicode_GET_SIZE(str),
3651 mapping,
3652 errors);
3653 Py_DECREF(str);
3654 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 onError:
3657 Py_XDECREF(str);
3658 return NULL;
3659}
Tim Petersced69f82003-09-16 20:30:58 +00003660
Guido van Rossum9e896b32000-04-05 20:11:21 +00003661/* --- Decimal Encoder ---------------------------------------------------- */
3662
3663int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003664 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003665 char *output,
3666 const char *errors)
3667{
3668 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 PyObject *errorHandler = NULL;
3670 PyObject *exc = NULL;
3671 const char *encoding = "decimal";
3672 const char *reason = "invalid decimal Unicode string";
3673 /* the following variable is used for caching string comparisons
3674 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3675 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003676
3677 if (output == NULL) {
3678 PyErr_BadArgument();
3679 return -1;
3680 }
3681
3682 p = s;
3683 end = s + length;
3684 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003686 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003688 Py_ssize_t repsize;
3689 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 Py_UNICODE *uni2;
3691 Py_UNICODE *collstart;
3692 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003693
Guido van Rossum9e896b32000-04-05 20:11:21 +00003694 if (Py_UNICODE_ISSPACE(ch)) {
3695 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003697 continue;
3698 }
3699 decimal = Py_UNICODE_TODECIMAL(ch);
3700 if (decimal >= 0) {
3701 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003703 continue;
3704 }
Guido van Rossumba477042000-04-06 18:18:10 +00003705 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003706 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003708 continue;
3709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 /* All other characters are considered unencodable */
3711 collstart = p;
3712 collend = p+1;
3713 while (collend < end) {
3714 if ((0 < *collend && *collend < 256) ||
3715 !Py_UNICODE_ISSPACE(*collend) ||
3716 Py_UNICODE_TODECIMAL(*collend))
3717 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 /* cache callback name lookup
3720 * (if not done yet, i.e. it's the first error) */
3721 if (known_errorHandler==-1) {
3722 if ((errors==NULL) || (!strcmp(errors, "strict")))
3723 known_errorHandler = 1;
3724 else if (!strcmp(errors, "replace"))
3725 known_errorHandler = 2;
3726 else if (!strcmp(errors, "ignore"))
3727 known_errorHandler = 3;
3728 else if (!strcmp(errors, "xmlcharrefreplace"))
3729 known_errorHandler = 4;
3730 else
3731 known_errorHandler = 0;
3732 }
3733 switch (known_errorHandler) {
3734 case 1: /* strict */
3735 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3736 goto onError;
3737 case 2: /* replace */
3738 for (p = collstart; p < collend; ++p)
3739 *output++ = '?';
3740 /* fall through */
3741 case 3: /* ignore */
3742 p = collend;
3743 break;
3744 case 4: /* xmlcharrefreplace */
3745 /* generate replacement (temporarily (mis)uses p) */
3746 for (p = collstart; p < collend; ++p)
3747 output += sprintf(output, "&#%d;", (int)*p);
3748 p = collend;
3749 break;
3750 default:
3751 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3752 encoding, reason, s, length, &exc,
3753 collstart-s, collend-s, &newpos);
3754 if (repunicode == NULL)
3755 goto onError;
3756 /* generate replacement */
3757 repsize = PyUnicode_GET_SIZE(repunicode);
3758 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3759 Py_UNICODE ch = *uni2;
3760 if (Py_UNICODE_ISSPACE(ch))
3761 *output++ = ' ';
3762 else {
3763 decimal = Py_UNICODE_TODECIMAL(ch);
3764 if (decimal >= 0)
3765 *output++ = '0' + decimal;
3766 else if (0 < ch && ch < 256)
3767 *output++ = (char)ch;
3768 else {
3769 Py_DECREF(repunicode);
3770 raise_encode_exception(&exc, encoding,
3771 s, length, collstart-s, collend-s, reason);
3772 goto onError;
3773 }
3774 }
3775 }
3776 p = s + newpos;
3777 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003778 }
3779 }
3780 /* 0-terminate the output string */
3781 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 Py_XDECREF(exc);
3783 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003784 return 0;
3785
3786 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 Py_XDECREF(exc);
3788 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003789 return -1;
3790}
3791
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792/* --- Helpers ------------------------------------------------------------ */
3793
Tim Petersced69f82003-09-16 20:30:58 +00003794static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003795Py_ssize_t count(PyUnicodeObject *self,
3796 Py_ssize_t start,
3797 Py_ssize_t end,
3798 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799{
3800 int count = 0;
3801
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003802 if (start < 0)
3803 start += self->length;
3804 if (start < 0)
3805 start = 0;
3806 if (end > self->length)
3807 end = self->length;
3808 if (end < 0)
3809 end += self->length;
3810 if (end < 0)
3811 end = 0;
3812
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003813 if (substring->length == 0)
3814 return (end - start + 1);
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 end -= substring->length;
3817
3818 while (start <= end)
3819 if (Py_UNICODE_MATCH(self, start, substring)) {
3820 count++;
3821 start += substring->length;
3822 } else
3823 start++;
3824
3825 return count;
3826}
3827
Martin v. Löwis18e16552006-02-15 17:27:45 +00003828Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003830 Py_ssize_t start,
3831 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003833 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003834
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 str = PyUnicode_FromObject(str);
3836 if (str == NULL)
3837 return -1;
3838 substr = PyUnicode_FromObject(substr);
3839 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003840 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 return -1;
3842 }
Tim Petersced69f82003-09-16 20:30:58 +00003843
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 result = count((PyUnicodeObject *)str,
3845 start, end,
3846 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003847
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848 Py_DECREF(str);
3849 Py_DECREF(substr);
3850 return result;
3851}
3852
Tim Petersced69f82003-09-16 20:30:58 +00003853static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003854Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003856 Py_ssize_t start,
3857 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 int direction)
3859{
3860 if (start < 0)
3861 start += self->length;
3862 if (start < 0)
3863 start = 0;
3864
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 if (end > self->length)
3866 end = self->length;
3867 if (end < 0)
3868 end += self->length;
3869 if (end < 0)
3870 end = 0;
3871
Guido van Rossum76afbd92002-08-20 17:29:29 +00003872 if (substring->length == 0)
3873 return (direction > 0) ? start : end;
3874
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875 end -= substring->length;
3876
3877 if (direction < 0) {
3878 for (; end >= start; end--)
3879 if (Py_UNICODE_MATCH(self, end, substring))
3880 return end;
3881 } else {
3882 for (; start <= end; start++)
3883 if (Py_UNICODE_MATCH(self, start, substring))
3884 return start;
3885 }
3886
3887 return -1;
3888}
3889
Martin v. Löwis18e16552006-02-15 17:27:45 +00003890Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003892 Py_ssize_t start,
3893 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 int direction)
3895{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003896 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003897
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 str = PyUnicode_FromObject(str);
3899 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003900 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 substr = PyUnicode_FromObject(substr);
3902 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003903 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003904 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905 }
Tim Petersced69f82003-09-16 20:30:58 +00003906
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 result = findstring((PyUnicodeObject *)str,
3908 (PyUnicodeObject *)substr,
3909 start, end, direction);
3910 Py_DECREF(str);
3911 Py_DECREF(substr);
3912 return result;
3913}
3914
Tim Petersced69f82003-09-16 20:30:58 +00003915static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916int tailmatch(PyUnicodeObject *self,
3917 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003918 Py_ssize_t start,
3919 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 int direction)
3921{
3922 if (start < 0)
3923 start += self->length;
3924 if (start < 0)
3925 start = 0;
3926
3927 if (substring->length == 0)
3928 return 1;
3929
3930 if (end > self->length)
3931 end = self->length;
3932 if (end < 0)
3933 end += self->length;
3934 if (end < 0)
3935 end = 0;
3936
3937 end -= substring->length;
3938 if (end < start)
3939 return 0;
3940
3941 if (direction > 0) {
3942 if (Py_UNICODE_MATCH(self, end, substring))
3943 return 1;
3944 } else {
3945 if (Py_UNICODE_MATCH(self, start, substring))
3946 return 1;
3947 }
3948
3949 return 0;
3950}
3951
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t start,
3955 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 int direction)
3957{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003958 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003959
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 str = PyUnicode_FromObject(str);
3961 if (str == NULL)
3962 return -1;
3963 substr = PyUnicode_FromObject(substr);
3964 if (substr == NULL) {
3965 Py_DECREF(substr);
3966 return -1;
3967 }
Tim Petersced69f82003-09-16 20:30:58 +00003968
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 result = tailmatch((PyUnicodeObject *)str,
3970 (PyUnicodeObject *)substr,
3971 start, end, direction);
3972 Py_DECREF(str);
3973 Py_DECREF(substr);
3974 return result;
3975}
3976
Tim Petersced69f82003-09-16 20:30:58 +00003977static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003979 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 Py_UNICODE ch)
3981{
3982 /* like wcschr, but doesn't stop at NULL characters */
3983
3984 while (size-- > 0) {
3985 if (*s == ch)
3986 return s;
3987 s++;
3988 }
3989
3990 return NULL;
3991}
3992
3993/* Apply fixfct filter to the Unicode object self and return a
3994 reference to the modified object */
3995
Tim Petersced69f82003-09-16 20:30:58 +00003996static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997PyObject *fixup(PyUnicodeObject *self,
3998 int (*fixfct)(PyUnicodeObject *s))
3999{
4000
4001 PyUnicodeObject *u;
4002
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004003 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 if (u == NULL)
4005 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004006
4007 Py_UNICODE_COPY(u->str, self->str, self->length);
4008
Tim Peters7a29bd52001-09-12 03:03:31 +00004009 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 /* fixfct should return TRUE if it modified the buffer. If
4011 FALSE, return a reference to the original buffer instead
4012 (to save space, not time) */
4013 Py_INCREF(self);
4014 Py_DECREF(u);
4015 return (PyObject*) self;
4016 }
4017 return (PyObject*) u;
4018}
4019
Tim Petersced69f82003-09-16 20:30:58 +00004020static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021int fixupper(PyUnicodeObject *self)
4022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004023 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 Py_UNICODE *s = self->str;
4025 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004026
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 while (len-- > 0) {
4028 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004029
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 ch = Py_UNICODE_TOUPPER(*s);
4031 if (ch != *s) {
4032 status = 1;
4033 *s = ch;
4034 }
4035 s++;
4036 }
4037
4038 return status;
4039}
4040
Tim Petersced69f82003-09-16 20:30:58 +00004041static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042int fixlower(PyUnicodeObject *self)
4043{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004044 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 Py_UNICODE *s = self->str;
4046 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004047
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 while (len-- > 0) {
4049 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004050
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 ch = Py_UNICODE_TOLOWER(*s);
4052 if (ch != *s) {
4053 status = 1;
4054 *s = ch;
4055 }
4056 s++;
4057 }
4058
4059 return status;
4060}
4061
Tim Petersced69f82003-09-16 20:30:58 +00004062static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063int fixswapcase(PyUnicodeObject *self)
4064{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 Py_UNICODE *s = self->str;
4067 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004068
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 while (len-- > 0) {
4070 if (Py_UNICODE_ISUPPER(*s)) {
4071 *s = Py_UNICODE_TOLOWER(*s);
4072 status = 1;
4073 } else if (Py_UNICODE_ISLOWER(*s)) {
4074 *s = Py_UNICODE_TOUPPER(*s);
4075 status = 1;
4076 }
4077 s++;
4078 }
4079
4080 return status;
4081}
4082
Tim Petersced69f82003-09-16 20:30:58 +00004083static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084int fixcapitalize(PyUnicodeObject *self)
4085{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004086 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004087 Py_UNICODE *s = self->str;
4088 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004089
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004090 if (len == 0)
4091 return 0;
4092 if (Py_UNICODE_ISLOWER(*s)) {
4093 *s = Py_UNICODE_TOUPPER(*s);
4094 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004096 s++;
4097 while (--len > 0) {
4098 if (Py_UNICODE_ISUPPER(*s)) {
4099 *s = Py_UNICODE_TOLOWER(*s);
4100 status = 1;
4101 }
4102 s++;
4103 }
4104 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105}
4106
4107static
4108int fixtitle(PyUnicodeObject *self)
4109{
4110 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4111 register Py_UNICODE *e;
4112 int previous_is_cased;
4113
4114 /* Shortcut for single character strings */
4115 if (PyUnicode_GET_SIZE(self) == 1) {
4116 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4117 if (*p != ch) {
4118 *p = ch;
4119 return 1;
4120 }
4121 else
4122 return 0;
4123 }
Tim Petersced69f82003-09-16 20:30:58 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 e = p + PyUnicode_GET_SIZE(self);
4126 previous_is_cased = 0;
4127 for (; p < e; p++) {
4128 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 if (previous_is_cased)
4131 *p = Py_UNICODE_TOLOWER(ch);
4132 else
4133 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004134
4135 if (Py_UNICODE_ISLOWER(ch) ||
4136 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 Py_UNICODE_ISTITLE(ch))
4138 previous_is_cased = 1;
4139 else
4140 previous_is_cased = 0;
4141 }
4142 return 1;
4143}
4144
Tim Peters8ce9f162004-08-27 01:49:32 +00004145PyObject *
4146PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147{
Tim Peters8ce9f162004-08-27 01:49:32 +00004148 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004149 const Py_UNICODE blank = ' ';
4150 const Py_UNICODE *sep = &blank;
4151 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004152 PyUnicodeObject *res = NULL; /* the result */
4153 size_t res_alloc = 100; /* # allocated bytes for string in res */
4154 size_t res_used; /* # used bytes */
4155 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4156 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004157 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004158 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 int i;
4160
Tim Peters05eba1f2004-08-27 21:32:02 +00004161 fseq = PySequence_Fast(seq, "");
4162 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004163 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004164 }
4165
Tim Peters91879ab2004-08-27 22:35:44 +00004166 /* Grrrr. A codec may be invoked to convert str objects to
4167 * Unicode, and so it's possible to call back into Python code
4168 * during PyUnicode_FromObject(), and so it's possible for a sick
4169 * codec to change the size of fseq (if seq is a list). Therefore
4170 * we have to keep refetching the size -- can't assume seqlen
4171 * is invariant.
4172 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004173 seqlen = PySequence_Fast_GET_SIZE(fseq);
4174 /* If empty sequence, return u"". */
4175 if (seqlen == 0) {
4176 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4177 goto Done;
4178 }
4179 /* If singleton sequence with an exact Unicode, return that. */
4180 if (seqlen == 1) {
4181 item = PySequence_Fast_GET_ITEM(fseq, 0);
4182 if (PyUnicode_CheckExact(item)) {
4183 Py_INCREF(item);
4184 res = (PyUnicodeObject *)item;
4185 goto Done;
4186 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004187 }
4188
Tim Peters05eba1f2004-08-27 21:32:02 +00004189 /* At least two items to join, or one that isn't exact Unicode. */
4190 if (seqlen > 1) {
4191 /* Set up sep and seplen -- they're needed. */
4192 if (separator == NULL) {
4193 sep = &blank;
4194 seplen = 1;
4195 }
4196 else {
4197 internal_separator = PyUnicode_FromObject(separator);
4198 if (internal_separator == NULL)
4199 goto onError;
4200 sep = PyUnicode_AS_UNICODE(internal_separator);
4201 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004202 /* In case PyUnicode_FromObject() mutated seq. */
4203 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004204 }
4205 }
4206
4207 /* Get space. */
4208 res = _PyUnicode_New((int)res_alloc);
4209 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004210 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004211 res_p = PyUnicode_AS_UNICODE(res);
4212 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004213
Tim Peters05eba1f2004-08-27 21:32:02 +00004214 for (i = 0; i < seqlen; ++i) {
4215 size_t itemlen;
4216 size_t new_res_used;
4217
4218 item = PySequence_Fast_GET_ITEM(fseq, i);
4219 /* Convert item to Unicode. */
4220 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4221 PyErr_Format(PyExc_TypeError,
4222 "sequence item %i: expected string or Unicode,"
4223 " %.80s found",
4224 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004225 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004226 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004227 item = PyUnicode_FromObject(item);
4228 if (item == NULL)
4229 goto onError;
4230 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004231
Tim Peters91879ab2004-08-27 22:35:44 +00004232 /* In case PyUnicode_FromObject() mutated seq. */
4233 seqlen = PySequence_Fast_GET_SIZE(fseq);
4234
Tim Peters8ce9f162004-08-27 01:49:32 +00004235 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004237 new_res_used = res_used + itemlen;
4238 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004239 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004240 if (i < seqlen - 1) {
4241 new_res_used += seplen;
4242 if (new_res_used < res_used || new_res_used > INT_MAX)
4243 goto Overflow;
4244 }
4245 if (new_res_used > res_alloc) {
4246 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004247 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004248 size_t oldsize = res_alloc;
4249 res_alloc += res_alloc;
4250 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004251 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004252 } while (new_res_used > res_alloc);
4253 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004254 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004256 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004257 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004259
4260 /* Copy item, and maybe the separator. */
4261 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4262 res_p += itemlen;
4263 if (i < seqlen - 1) {
4264 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4265 res_p += seplen;
4266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004268 res_used = new_res_used;
4269 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004270
Tim Peters05eba1f2004-08-27 21:32:02 +00004271 /* Shrink res to match the used area; this probably can't fail,
4272 * but it's cheap to check.
4273 */
4274 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004275 goto onError;
4276
4277 Done:
4278 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004279 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 return (PyObject *)res;
4281
Tim Peters8ce9f162004-08-27 01:49:32 +00004282 Overflow:
4283 PyErr_SetString(PyExc_OverflowError,
4284 "join() is too long for a Python string");
4285 Py_DECREF(item);
4286 /* fall through */
4287
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004289 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004290 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004291 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 return NULL;
4293}
4294
Tim Petersced69f82003-09-16 20:30:58 +00004295static
4296PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004297 Py_ssize_t left,
4298 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 Py_UNICODE fill)
4300{
4301 PyUnicodeObject *u;
4302
4303 if (left < 0)
4304 left = 0;
4305 if (right < 0)
4306 right = 0;
4307
Tim Peters7a29bd52001-09-12 03:03:31 +00004308 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 Py_INCREF(self);
4310 return self;
4311 }
4312
4313 u = _PyUnicode_New(left + self->length + right);
4314 if (u) {
4315 if (left)
4316 Py_UNICODE_FILL(u->str, fill, left);
4317 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4318 if (right)
4319 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4320 }
4321
4322 return u;
4323}
4324
/* Append the slice data[left:right] to `list` as a new unicode object.
   Expects `str`, `list` and an `onError` label in the enclosing
   function; jumps to onError if the slice cannot be created or
   appended.  Wrapped in do/while(0) so it behaves as one statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4335
/* Insert the slice data[left:right] at the front of `list` as a new
   unicode object.  Expects `str`, `list` and an `onError` label in the
   enclosing function; jumps to onError if the slice cannot be created
   or inserted.  Wrapped in do/while(0) so it behaves as one statement. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4346
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347static
4348PyObject *split_whitespace(PyUnicodeObject *self,
4349 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004350 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004352 register Py_ssize_t i;
4353 register Py_ssize_t j;
4354 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355 PyObject *str;
4356
4357 for (i = j = 0; i < len; ) {
4358 /* find a token */
4359 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4360 i++;
4361 j = i;
4362 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4363 i++;
4364 if (j < i) {
4365 if (maxcount-- <= 0)
4366 break;
4367 SPLIT_APPEND(self->str, j, i);
4368 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4369 i++;
4370 j = i;
4371 }
4372 }
4373 if (j < len) {
4374 SPLIT_APPEND(self->str, j, len);
4375 }
4376 return list;
4377
4378 onError:
4379 Py_DECREF(list);
4380 return NULL;
4381}
4382
4383PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004384 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004386 register Py_ssize_t i;
4387 register Py_ssize_t j;
4388 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389 PyObject *list;
4390 PyObject *str;
4391 Py_UNICODE *data;
4392
4393 string = PyUnicode_FromObject(string);
4394 if (string == NULL)
4395 return NULL;
4396 data = PyUnicode_AS_UNICODE(string);
4397 len = PyUnicode_GET_SIZE(string);
4398
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 list = PyList_New(0);
4400 if (!list)
4401 goto onError;
4402
4403 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004404 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004405
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406 /* Find a line and append it */
4407 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4408 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409
4410 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004411 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 if (i < len) {
4413 if (data[i] == '\r' && i + 1 < len &&
4414 data[i+1] == '\n')
4415 i += 2;
4416 else
4417 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004418 if (keepends)
4419 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 }
Guido van Rossum86662912000-04-11 15:38:46 +00004421 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 j = i;
4423 }
4424 if (j < len) {
4425 SPLIT_APPEND(data, j, len);
4426 }
4427
4428 Py_DECREF(string);
4429 return list;
4430
4431 onError:
4432 Py_DECREF(list);
4433 Py_DECREF(string);
4434 return NULL;
4435}
4436
Tim Petersced69f82003-09-16 20:30:58 +00004437static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438PyObject *split_char(PyUnicodeObject *self,
4439 PyObject *list,
4440 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004441 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004443 register Py_ssize_t i;
4444 register Py_ssize_t j;
4445 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 PyObject *str;
4447
4448 for (i = j = 0; i < len; ) {
4449 if (self->str[i] == ch) {
4450 if (maxcount-- <= 0)
4451 break;
4452 SPLIT_APPEND(self->str, j, i);
4453 i = j = i + 1;
4454 } else
4455 i++;
4456 }
4457 if (j <= len) {
4458 SPLIT_APPEND(self->str, j, len);
4459 }
4460 return list;
4461
4462 onError:
4463 Py_DECREF(list);
4464 return NULL;
4465}
4466
Tim Petersced69f82003-09-16 20:30:58 +00004467static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468PyObject *split_substring(PyUnicodeObject *self,
4469 PyObject *list,
4470 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004471 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004473 register Py_ssize_t i;
4474 register Py_ssize_t j;
4475 Py_ssize_t len = self->length;
4476 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 PyObject *str;
4478
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004479 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 if (Py_UNICODE_MATCH(self, i, substring)) {
4481 if (maxcount-- <= 0)
4482 break;
4483 SPLIT_APPEND(self->str, j, i);
4484 i = j = i + sublen;
4485 } else
4486 i++;
4487 }
4488 if (j <= len) {
4489 SPLIT_APPEND(self->str, j, len);
4490 }
4491 return list;
4492
4493 onError:
4494 Py_DECREF(list);
4495 return NULL;
4496}
4497
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004498static
4499PyObject *rsplit_whitespace(PyUnicodeObject *self,
4500 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004501 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004502{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 register Py_ssize_t i;
4504 register Py_ssize_t j;
4505 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004506 PyObject *str;
4507
4508 for (i = j = len - 1; i >= 0; ) {
4509 /* find a token */
4510 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4511 i--;
4512 j = i;
4513 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4514 i--;
4515 if (j > i) {
4516 if (maxcount-- <= 0)
4517 break;
4518 SPLIT_INSERT(self->str, i + 1, j + 1);
4519 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4520 i--;
4521 j = i;
4522 }
4523 }
4524 if (j >= 0) {
4525 SPLIT_INSERT(self->str, 0, j + 1);
4526 }
4527 return list;
4528
4529 onError:
4530 Py_DECREF(list);
4531 return NULL;
4532}
4533
4534static
4535PyObject *rsplit_char(PyUnicodeObject *self,
4536 PyObject *list,
4537 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004538 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004539{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004540 register Py_ssize_t i;
4541 register Py_ssize_t j;
4542 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004543 PyObject *str;
4544
4545 for (i = j = len - 1; i >= 0; ) {
4546 if (self->str[i] == ch) {
4547 if (maxcount-- <= 0)
4548 break;
4549 SPLIT_INSERT(self->str, i + 1, j + 1);
4550 j = i = i - 1;
4551 } else
4552 i--;
4553 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004554 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004555 SPLIT_INSERT(self->str, 0, j + 1);
4556 }
4557 return list;
4558
4559 onError:
4560 Py_DECREF(list);
4561 return NULL;
4562}
4563
4564static
4565PyObject *rsplit_substring(PyUnicodeObject *self,
4566 PyObject *list,
4567 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004568 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004569{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004570 register Py_ssize_t i;
4571 register Py_ssize_t j;
4572 Py_ssize_t len = self->length;
4573 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004574 PyObject *str;
4575
4576 for (i = len - sublen, j = len; i >= 0; ) {
4577 if (Py_UNICODE_MATCH(self, i, substring)) {
4578 if (maxcount-- <= 0)
4579 break;
4580 SPLIT_INSERT(self->str, i + sublen, j);
4581 j = i;
4582 i -= sublen;
4583 } else
4584 i--;
4585 }
4586 if (j >= 0) {
4587 SPLIT_INSERT(self->str, 0, j);
4588 }
4589 return list;
4590
4591 onError:
4592 Py_DECREF(list);
4593 return NULL;
4594}
4595
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004597#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598
4599static
4600PyObject *split(PyUnicodeObject *self,
4601 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004602 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603{
4604 PyObject *list;
4605
4606 if (maxcount < 0)
4607 maxcount = INT_MAX;
4608
4609 list = PyList_New(0);
4610 if (!list)
4611 return NULL;
4612
4613 if (substring == NULL)
4614 return split_whitespace(self,list,maxcount);
4615
4616 else if (substring->length == 1)
4617 return split_char(self,list,substring->str[0],maxcount);
4618
4619 else if (substring->length == 0) {
4620 Py_DECREF(list);
4621 PyErr_SetString(PyExc_ValueError, "empty separator");
4622 return NULL;
4623 }
4624 else
4625 return split_substring(self,list,substring,maxcount);
4626}
4627
Tim Petersced69f82003-09-16 20:30:58 +00004628static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004629PyObject *rsplit(PyUnicodeObject *self,
4630 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004631 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004632{
4633 PyObject *list;
4634
4635 if (maxcount < 0)
4636 maxcount = INT_MAX;
4637
4638 list = PyList_New(0);
4639 if (!list)
4640 return NULL;
4641
4642 if (substring == NULL)
4643 return rsplit_whitespace(self,list,maxcount);
4644
4645 else if (substring->length == 1)
4646 return rsplit_char(self,list,substring->str[0],maxcount);
4647
4648 else if (substring->length == 0) {
4649 Py_DECREF(list);
4650 PyErr_SetString(PyExc_ValueError, "empty separator");
4651 return NULL;
4652 }
4653 else
4654 return rsplit_substring(self,list,substring,maxcount);
4655}
4656
4657static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658PyObject *replace(PyUnicodeObject *self,
4659 PyUnicodeObject *str1,
4660 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004661 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662{
4663 PyUnicodeObject *u;
4664
4665 if (maxcount < 0)
4666 maxcount = INT_MAX;
4667
4668 if (str1->length == 1 && str2->length == 1) {
4669 int i;
4670
4671 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004672 if (!findchar(self->str, self->length, str1->str[0]) &&
4673 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 /* nothing to replace, return original string */
4675 Py_INCREF(self);
4676 u = self;
4677 } else {
4678 Py_UNICODE u1 = str1->str[0];
4679 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004680
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004682 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 self->length
4684 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004685 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004686 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004687 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688 for (i = 0; i < u->length; i++)
4689 if (u->str[i] == u1) {
4690 if (--maxcount < 0)
4691 break;
4692 u->str[i] = u2;
4693 }
4694 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696
4697 } else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004698 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699 Py_UNICODE *p;
4700
4701 /* replace strings */
4702 n = count(self, 0, self->length, str1);
4703 if (n > maxcount)
4704 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004705 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004707 if (PyUnicode_CheckExact(self)) {
4708 Py_INCREF(self);
4709 u = self;
4710 }
4711 else {
4712 u = (PyUnicodeObject *)
4713 PyUnicode_FromUnicode(self->str, self->length);
4714 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715 } else {
4716 u = _PyUnicode_New(
4717 self->length + n * (str2->length - str1->length));
4718 if (u) {
4719 i = 0;
4720 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004721 if (str1->length > 0) {
4722 while (i <= self->length - str1->length)
4723 if (Py_UNICODE_MATCH(self, i, str1)) {
4724 /* replace string segment */
4725 Py_UNICODE_COPY(p, str2->str, str2->length);
4726 p += str2->length;
4727 i += str1->length;
4728 if (--n <= 0) {
4729 /* copy remaining part */
4730 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4731 break;
4732 }
4733 } else
4734 *p++ = self->str[i++];
4735 } else {
4736 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 Py_UNICODE_COPY(p, str2->str, str2->length);
4738 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004739 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004742 }
4743 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 }
4746 }
4747 }
Tim Petersced69f82003-09-16 20:30:58 +00004748
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749 return (PyObject *) u;
4750}
4751
4752/* --- Unicode Object Methods --------------------------------------------- */
4753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004754PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755"S.title() -> unicode\n\
4756\n\
4757Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004758characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759
4760static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004761unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 return fixup(self, fixtitle);
4764}
4765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004766PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767"S.capitalize() -> unicode\n\
4768\n\
4769Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004770have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771
4772static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004773unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 return fixup(self, fixcapitalize);
4776}
4777
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method S.capwords(): split self into a list of words,
   replace each list entry by its fixcapitalize'd version, then join the
   list back together.  Returns NULL if the split, any fixup() call, or
   the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Break the string into its words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap every word for its capitalized form, in place */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (result == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the list's reference to the old word before the slot is
           overwritten; PyList_SET_ITEM does not do it for us */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Glue the capitalized words back into one string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4815
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004816/* Argument converter. Coerces to a single unicode character */
4817
4818static int
4819convert_uc(PyObject *obj, void *addr)
4820{
4821 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4822 PyObject *uniobj;
4823 Py_UNICODE *unistr;
4824
4825 uniobj = PyUnicode_FromObject(obj);
4826 if (uniobj == NULL) {
4827 PyErr_SetString(PyExc_TypeError,
4828 "The fill character cannot be converted to Unicode");
4829 return 0;
4830 }
4831 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4832 PyErr_SetString(PyExc_TypeError,
4833 "The fill character must be exactly one character long");
4834 Py_DECREF(uniobj);
4835 return 0;
4836 }
4837 unistr = PyUnicode_AS_UNICODE(uniobj);
4838 *fillcharloc = unistr[0];
4839 Py_DECREF(uniobj);
4840 return 1;
4841}
4842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004843PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004844"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004846Return S centered in a Unicode string of length width. Padding is\n\
4847done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848
4849static PyObject *
4850unicode_center(PyUnicodeObject *self, PyObject *args)
4851{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t marg, left;
4853 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004854 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855
Thomas Woutersde017742006-02-16 19:34:37 +00004856 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 return NULL;
4858
Tim Peters7a29bd52001-09-12 03:03:31 +00004859 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 Py_INCREF(self);
4861 return (PyObject*) self;
4862 }
4863
4864 marg = width - self->length;
4865 left = marg / 2 + (marg & width & 1);
4866
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004867 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868}
4869
Marc-André Lemburge5034372000-08-08 08:04:29 +00004870#if 0
4871
4872/* This code should go into some future Unicode collation support
4873 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004874 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004875
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004876/* speedy UTF-16 code point order comparison */
4877/* gleaned from: */
4878/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4879
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004880static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004881{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004882 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004883 0, 0, 0, 0, 0, 0, 0, 0,
4884 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004885 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004886};
4887
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888static int
4889unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4890{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004891 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004892
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 Py_UNICODE *s1 = str1->str;
4894 Py_UNICODE *s2 = str2->str;
4895
4896 len1 = str1->length;
4897 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004898
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004900 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004901
4902 c1 = *s1++;
4903 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004904
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004905 if (c1 > (1<<11) * 26)
4906 c1 += utf16Fixup[c1>>11];
4907 if (c2 > (1<<11) * 26)
4908 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004909 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004910
4911 if (c1 != c2)
4912 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004913
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004914 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 }
4916
4917 return (len1 < len2) ? -1 : (len1 != len2);
4918}
4919
Marc-André Lemburge5034372000-08-08 08:04:29 +00004920#else
4921
4922static int
4923unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4924{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004925 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004926
4927 Py_UNICODE *s1 = str1->str;
4928 Py_UNICODE *s2 = str2->str;
4929
4930 len1 = str1->length;
4931 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004932
Marc-André Lemburge5034372000-08-08 08:04:29 +00004933 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004934 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004935
Fredrik Lundh45714e92001-06-26 16:39:36 +00004936 c1 = *s1++;
4937 c2 = *s2++;
4938
4939 if (c1 != c2)
4940 return (c1 < c2) ? -1 : 1;
4941
Marc-André Lemburge5034372000-08-08 08:04:29 +00004942 len1--; len2--;
4943 }
4944
4945 return (len1 < len2) ? -1 : (len1 != len2);
4946}
4947
4948#endif
4949
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950int PyUnicode_Compare(PyObject *left,
4951 PyObject *right)
4952{
4953 PyUnicodeObject *u = NULL, *v = NULL;
4954 int result;
4955
4956 /* Coerce the two arguments */
4957 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4958 if (u == NULL)
4959 goto onError;
4960 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4961 if (v == NULL)
4962 goto onError;
4963
Thomas Wouters7e474022000-07-16 12:04:32 +00004964 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 if (v == u) {
4966 Py_DECREF(u);
4967 Py_DECREF(v);
4968 return 0;
4969 }
4970
4971 result = unicode_compare(u, v);
4972
4973 Py_DECREF(u);
4974 Py_DECREF(v);
4975 return result;
4976
4977onError:
4978 Py_XDECREF(u);
4979 Py_XDECREF(v);
4980 return -1;
4981}
4982
Guido van Rossum403d68b2000-03-13 15:55:09 +00004983int PyUnicode_Contains(PyObject *container,
4984 PyObject *element)
4985{
4986 PyUnicodeObject *u = NULL, *v = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004987 int result;
4988 Py_ssize_t size;
Barry Warsaw817918c2002-08-06 16:58:21 +00004989 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004990
4991 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004992 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004993 if (v == NULL) {
4994 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004995 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004996 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004997 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004998 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004999 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005000 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005001
Barry Warsaw817918c2002-08-06 16:58:21 +00005002 size = PyUnicode_GET_SIZE(v);
5003 rhs = PyUnicode_AS_UNICODE(v);
5004 lhs = PyUnicode_AS_UNICODE(u);
5005
Guido van Rossum403d68b2000-03-13 15:55:09 +00005006 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00005007 if (size == 1) {
5008 end = lhs + PyUnicode_GET_SIZE(u);
5009 while (lhs < end) {
5010 if (*lhs++ == *rhs) {
5011 result = 1;
5012 break;
5013 }
5014 }
5015 }
5016 else {
5017 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5018 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00005019 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005020 result = 1;
5021 break;
5022 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005023 }
5024 }
5025
5026 Py_DECREF(u);
5027 Py_DECREF(v);
5028 return result;
5029
5030onError:
5031 Py_XDECREF(u);
5032 Py_XDECREF(v);
5033 return -1;
5034}
5035
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036/* Concat to string or Unicode object giving a new Unicode object. */
5037
5038PyObject *PyUnicode_Concat(PyObject *left,
5039 PyObject *right)
5040{
5041 PyUnicodeObject *u = NULL, *v = NULL, *w;
5042
5043 /* Coerce the two arguments */
5044 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5045 if (u == NULL)
5046 goto onError;
5047 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5048 if (v == NULL)
5049 goto onError;
5050
5051 /* Shortcuts */
5052 if (v == unicode_empty) {
5053 Py_DECREF(v);
5054 return (PyObject *)u;
5055 }
5056 if (u == unicode_empty) {
5057 Py_DECREF(u);
5058 return (PyObject *)v;
5059 }
5060
5061 /* Concat the two Unicode strings */
5062 w = _PyUnicode_New(u->length + v->length);
5063 if (w == NULL)
5064 goto onError;
5065 Py_UNICODE_COPY(w->str, u->str, u->length);
5066 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5067
5068 Py_DECREF(u);
5069 Py_DECREF(v);
5070 return (PyObject *)w;
5071
5072onError:
5073 Py_XDECREF(u);
5074 Py_XDECREF(v);
5075 return NULL;
5076}
5077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005078PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079"S.count(sub[, start[, end]]) -> int\n\
5080\n\
5081Return the number of occurrences of substring sub in Unicode string\n\
5082S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005083interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084
5085static PyObject *
5086unicode_count(PyUnicodeObject *self, PyObject *args)
5087{
5088 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005089 Py_ssize_t start = 0;
5090 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 PyObject *result;
5092
Guido van Rossumb8872e62000-05-09 14:14:27 +00005093 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5094 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 return NULL;
5096
5097 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5098 (PyObject *)substring);
5099 if (substring == NULL)
5100 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005101
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 if (start < 0)
5103 start += self->length;
5104 if (start < 0)
5105 start = 0;
5106 if (end > self->length)
5107 end = self->length;
5108 if (end < 0)
5109 end += self->length;
5110 if (end < 0)
5111 end = 0;
5112
5113 result = PyInt_FromLong((long) count(self, start, end, substring));
5114
5115 Py_DECREF(substring);
5116 return result;
5117}
5118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005119PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005120"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005122Encodes S using the codec registered for encoding. encoding defaults\n\
5123to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005124handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5126'xmlcharrefreplace' as well as any other name registered with\n\
5127codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128
5129static PyObject *
5130unicode_encode(PyUnicodeObject *self, PyObject *args)
5131{
5132 char *encoding = NULL;
5133 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005134 PyObject *v;
5135
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5137 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005138 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005139 if (v == NULL)
5140 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005141 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5142 PyErr_Format(PyExc_TypeError,
5143 "encoder did not return a string/unicode object "
5144 "(type=%.400s)",
5145 v->ob_type->tp_name);
5146 Py_DECREF(v);
5147 return NULL;
5148 }
5149 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005150
5151 onError:
5152 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005153}
5154
5155PyDoc_STRVAR(decode__doc__,
5156"S.decode([encoding[,errors]]) -> string or unicode\n\
5157\n\
5158Decodes S using the codec registered for encoding. encoding defaults\n\
5159to the default encoding. errors may be given to set a different error\n\
5160handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5161a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5162as well as any other name registerd with codecs.register_error that is\n\
5163able to handle UnicodeDecodeErrors.");
5164
5165static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005166unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005167{
5168 char *encoding = NULL;
5169 char *errors = NULL;
5170 PyObject *v;
5171
5172 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5173 return NULL;
5174 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005175 if (v == NULL)
5176 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005177 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5178 PyErr_Format(PyExc_TypeError,
5179 "decoder did not return a string/unicode object "
5180 "(type=%.400s)",
5181 v->ob_type->tp_name);
5182 Py_DECREF(v);
5183 return NULL;
5184 }
5185 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005186
5187 onError:
5188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189}
5190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005191PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192"S.expandtabs([tabsize]) -> unicode\n\
5193\n\
5194Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005195If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196
5197static PyObject*
5198unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5199{
5200 Py_UNICODE *e;
5201 Py_UNICODE *p;
5202 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 PyUnicodeObject *u;
5205 int tabsize = 8;
5206
5207 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5208 return NULL;
5209
Thomas Wouters7e474022000-07-16 12:04:32 +00005210 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 i = j = 0;
5212 e = self->str + self->length;
5213 for (p = self->str; p < e; p++)
5214 if (*p == '\t') {
5215 if (tabsize > 0)
5216 j += tabsize - (j % tabsize);
5217 }
5218 else {
5219 j++;
5220 if (*p == '\n' || *p == '\r') {
5221 i += j;
5222 j = 0;
5223 }
5224 }
5225
5226 /* Second pass: create output string and fill it */
5227 u = _PyUnicode_New(i + j);
5228 if (!u)
5229 return NULL;
5230
5231 j = 0;
5232 q = u->str;
5233
5234 for (p = self->str; p < e; p++)
5235 if (*p == '\t') {
5236 if (tabsize > 0) {
5237 i = tabsize - (j % tabsize);
5238 j += i;
5239 while (i--)
5240 *q++ = ' ';
5241 }
5242 }
5243 else {
5244 j++;
5245 *q++ = *p;
5246 if (*p == '\n' || *p == '\r')
5247 j = 0;
5248 }
5249
5250 return (PyObject*) u;
5251}
5252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005253PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254"S.find(sub [,start [,end]]) -> int\n\
5255\n\
5256Return the lowest index in S where substring sub is found,\n\
5257such that sub is contained within s[start,end]. Optional\n\
5258arguments start and end are interpreted as in slice notation.\n\
5259\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005260Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261
5262static PyObject *
5263unicode_find(PyUnicodeObject *self, PyObject *args)
5264{
5265 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005266 Py_ssize_t start = 0;
5267 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 PyObject *result;
5269
Guido van Rossumb8872e62000-05-09 14:14:27 +00005270 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5271 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 return NULL;
5273 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5274 (PyObject *)substring);
5275 if (substring == NULL)
5276 return NULL;
5277
Martin v. Löwis18e16552006-02-15 17:27:45 +00005278 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279
5280 Py_DECREF(substring);
5281 return result;
5282}
5283
5284static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005285unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286{
5287 if (index < 0 || index >= self->length) {
5288 PyErr_SetString(PyExc_IndexError, "string index out of range");
5289 return NULL;
5290 }
5291
5292 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5293}
5294
5295static long
5296unicode_hash(PyUnicodeObject *self)
5297{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005298 /* Since Unicode objects compare equal to their ASCII string
5299 counterparts, they should use the individual character values
5300 as basis for their hash value. This is needed to assure that
5301 strings and Unicode objects behave in the same way as
5302 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303
Martin v. Löwis18e16552006-02-15 17:27:45 +00005304 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005305 register Py_UNICODE *p;
5306 register long x;
5307
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 if (self->hash != -1)
5309 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005310 len = PyUnicode_GET_SIZE(self);
5311 p = PyUnicode_AS_UNICODE(self);
5312 x = *p << 7;
5313 while (--len >= 0)
5314 x = (1000003*x) ^ *p++;
5315 x ^= PyUnicode_GET_SIZE(self);
5316 if (x == -1)
5317 x = -2;
5318 self->hash = x;
5319 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320}
5321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005322PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323"S.index(sub [,start [,end]]) -> int\n\
5324\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005325Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326
5327static PyObject *
5328unicode_index(PyUnicodeObject *self, PyObject *args)
5329{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005330 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005332 Py_ssize_t start = 0;
5333 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334
Guido van Rossumb8872e62000-05-09 14:14:27 +00005335 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5336 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005338
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5340 (PyObject *)substring);
5341 if (substring == NULL)
5342 return NULL;
5343
5344 result = findstring(self, substring, start, end, 1);
5345
5346 Py_DECREF(substring);
5347 if (result < 0) {
5348 PyErr_SetString(PyExc_ValueError, "substring not found");
5349 return NULL;
5350 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005351 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352}
5353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005354PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005355"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005357Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005358at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
5360static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005361unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362{
5363 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5364 register const Py_UNICODE *e;
5365 int cased;
5366
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 /* Shortcut for single character strings */
5368 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005369 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005371 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005372 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005373 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005374
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 e = p + PyUnicode_GET_SIZE(self);
5376 cased = 0;
5377 for (; p < e; p++) {
5378 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005379
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005381 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 else if (!cased && Py_UNICODE_ISLOWER(ch))
5383 cased = 1;
5384 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005385 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386}
5387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005388PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005389"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005391Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005392at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393
5394static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005395unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396{
5397 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5398 register const Py_UNICODE *e;
5399 int cased;
5400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 /* Shortcut for single character strings */
5402 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005403 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005405 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005406 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005407 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005408
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 e = p + PyUnicode_GET_SIZE(self);
5410 cased = 0;
5411 for (; p < e; p++) {
5412 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005413
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005415 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 else if (!cased && Py_UNICODE_ISUPPER(ch))
5417 cased = 1;
5418 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005419 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420}
5421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005422PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005423"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005425Return True if S is a titlecased string and there is at least one\n\
5426character in S, i.e. upper- and titlecase characters may only\n\
5427follow uncased characters and lowercase characters only cased ones.\n\
5428Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429
5430static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005431unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432{
5433 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5434 register const Py_UNICODE *e;
5435 int cased, previous_is_cased;
5436
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 /* Shortcut for single character strings */
5438 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005439 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5440 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005442 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005443 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005444 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005445
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 e = p + PyUnicode_GET_SIZE(self);
5447 cased = 0;
5448 previous_is_cased = 0;
5449 for (; p < e; p++) {
5450 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005451
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5453 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005454 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 previous_is_cased = 1;
5456 cased = 1;
5457 }
5458 else if (Py_UNICODE_ISLOWER(ch)) {
5459 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005460 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 previous_is_cased = 1;
5462 cased = 1;
5463 }
5464 else
5465 previous_is_cased = 0;
5466 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005467 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468}
5469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005470PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005471"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005473Return True if all characters in S are whitespace\n\
5474and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
5476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005477unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478{
5479 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5480 register const Py_UNICODE *e;
5481
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 /* Shortcut for single character strings */
5483 if (PyUnicode_GET_SIZE(self) == 1 &&
5484 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005485 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005487 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005488 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005489 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005490
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 e = p + PyUnicode_GET_SIZE(self);
5492 for (; p < e; p++) {
5493 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005494 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005496 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497}
5498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005499PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005500"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005501\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005502Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005503and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005504
5505static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005506unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005507{
5508 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5509 register const Py_UNICODE *e;
5510
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005511 /* Shortcut for single character strings */
5512 if (PyUnicode_GET_SIZE(self) == 1 &&
5513 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005514 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005515
5516 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005517 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005518 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005519
5520 e = p + PyUnicode_GET_SIZE(self);
5521 for (; p < e; p++) {
5522 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005523 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005525 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005526}
5527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005528PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005529"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005530\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005531Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005532and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005533
5534static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005535unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005536{
5537 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5538 register const Py_UNICODE *e;
5539
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005540 /* Shortcut for single character strings */
5541 if (PyUnicode_GET_SIZE(self) == 1 &&
5542 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005543 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005544
5545 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005546 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005547 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005548
5549 e = p + PyUnicode_GET_SIZE(self);
5550 for (; p < e; p++) {
5551 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005552 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005553 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005554 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005555}
5556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005557PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005558"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005560Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005561False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
5563static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005564unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565{
5566 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5567 register const Py_UNICODE *e;
5568
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 /* Shortcut for single character strings */
5570 if (PyUnicode_GET_SIZE(self) == 1 &&
5571 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005572 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005574 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005575 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005576 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005577
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 e = p + PyUnicode_GET_SIZE(self);
5579 for (; p < e; p++) {
5580 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005581 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005583 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584}
5585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005586PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005587"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005589Return True if all characters in S are digits\n\
5590and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591
5592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005593unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594{
5595 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5596 register const Py_UNICODE *e;
5597
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 /* Shortcut for single character strings */
5599 if (PyUnicode_GET_SIZE(self) == 1 &&
5600 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005601 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005603 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005604 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005605 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005606
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 e = p + PyUnicode_GET_SIZE(self);
5608 for (; p < e; p++) {
5609 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005610 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005612 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613}
5614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005615PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005616"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005618Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005619False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620
5621static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005622unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623{
5624 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5625 register const Py_UNICODE *e;
5626
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 /* Shortcut for single character strings */
5628 if (PyUnicode_GET_SIZE(self) == 1 &&
5629 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005630 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005632 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005633 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005634 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005635
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 e = p + PyUnicode_GET_SIZE(self);
5637 for (; p < e; p++) {
5638 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005639 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005641 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642}
5643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005644PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645"S.join(sequence) -> unicode\n\
5646\n\
5647Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005648sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649
5650static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005651unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005653 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654}
5655
Martin v. Löwis18e16552006-02-15 17:27:45 +00005656static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657unicode_length(PyUnicodeObject *self)
5658{
5659 return self->length;
5660}
5661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005662PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005663"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664\n\
5665Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005666done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667
5668static PyObject *
5669unicode_ljust(PyUnicodeObject *self, PyObject *args)
5670{
5671 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005672 Py_UNICODE fillchar = ' ';
5673
5674 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 return NULL;
5676
Tim Peters7a29bd52001-09-12 03:03:31 +00005677 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 Py_INCREF(self);
5679 return (PyObject*) self;
5680 }
5681
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005682 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683}
5684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005685PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686"S.lower() -> unicode\n\
5687\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005688Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
5690static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005691unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 return fixup(self, fixlower);
5694}
5695
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
5704
5705static const Py_UNICODE *
5706unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5707{
Tim Peters030a5ce2002-04-22 19:00:10 +00005708 size_t i;
5709 for (i = 0; i < n; ++i)
5710 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005711 return s+i;
5712 return NULL;
5713}
5714
5715/* externally visible for str.strip(unicode) */
5716PyObject *
5717_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5718{
5719 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005720 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005721 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005722 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5723 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005724
5725 i = 0;
5726 if (striptype != RIGHTSTRIP) {
5727 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5728 i++;
5729 }
5730 }
5731
5732 j = len;
5733 if (striptype != LEFTSTRIP) {
5734 do {
5735 j--;
5736 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5737 j++;
5738 }
5739
5740 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5741 Py_INCREF(self);
5742 return (PyObject*)self;
5743 }
5744 else
5745 return PyUnicode_FromUnicode(s+i, j-i);
5746}
5747
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
5749static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005750do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005752 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005753 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005754
5755 i = 0;
5756 if (striptype != RIGHTSTRIP) {
5757 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5758 i++;
5759 }
5760 }
5761
5762 j = len;
5763 if (striptype != LEFTSTRIP) {
5764 do {
5765 j--;
5766 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5767 j++;
5768 }
5769
5770 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5771 Py_INCREF(self);
5772 return (PyObject*)self;
5773 }
5774 else
5775 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776}
5777
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005778
5779static PyObject *
5780do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5781{
5782 PyObject *sep = NULL;
5783
5784 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5785 return NULL;
5786
5787 if (sep != NULL && sep != Py_None) {
5788 if (PyUnicode_Check(sep))
5789 return _PyUnicode_XStrip(self, striptype, sep);
5790 else if (PyString_Check(sep)) {
5791 PyObject *res;
5792 sep = PyUnicode_FromObject(sep);
5793 if (sep==NULL)
5794 return NULL;
5795 res = _PyUnicode_XStrip(self, striptype, sep);
5796 Py_DECREF(sep);
5797 return res;
5798 }
5799 else {
5800 PyErr_Format(PyExc_TypeError,
5801 "%s arg must be None, unicode or str",
5802 STRIPNAME(striptype));
5803 return NULL;
5804 }
5805 }
5806
5807 return do_strip(self, striptype);
5808}
5809
5810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005811PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005812"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005813\n\
5814Return a copy of the string S with leading and trailing\n\
5815whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005816If chars is given and not None, remove characters in chars instead.\n\
5817If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005818
5819static PyObject *
5820unicode_strip(PyUnicodeObject *self, PyObject *args)
5821{
5822 if (PyTuple_GET_SIZE(args) == 0)
5823 return do_strip(self, BOTHSTRIP); /* Common case */
5824 else
5825 return do_argstrip(self, BOTHSTRIP, args);
5826}
5827
5828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005829PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005830"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005831\n\
5832Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005833If chars is given and not None, remove characters in chars instead.\n\
5834If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005835
5836static PyObject *
5837unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5838{
5839 if (PyTuple_GET_SIZE(args) == 0)
5840 return do_strip(self, LEFTSTRIP); /* Common case */
5841 else
5842 return do_argstrip(self, LEFTSTRIP, args);
5843}
5844
5845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005846PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005847"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005848\n\
5849Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005850If chars is given and not None, remove characters in chars instead.\n\
5851If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005852
5853static PyObject *
5854unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5855{
5856 if (PyTuple_GET_SIZE(args) == 0)
5857 return do_strip(self, RIGHTSTRIP); /* Common case */
5858 else
5859 return do_argstrip(self, RIGHTSTRIP, args);
5860}
5861
5862
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00005864unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865{
5866 PyUnicodeObject *u;
5867 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00005869 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
5871 if (len < 0)
5872 len = 0;
5873
Tim Peters7a29bd52001-09-12 03:03:31 +00005874 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 /* no repeat, return original string */
5876 Py_INCREF(str);
5877 return (PyObject*) str;
5878 }
Tim Peters8f422462000-09-09 06:13:41 +00005879
5880 /* ensure # of chars needed doesn't overflow int and # of bytes
5881 * needed doesn't overflow size_t
5882 */
5883 nchars = len * str->length;
5884 if (len && nchars / len != str->length) {
5885 PyErr_SetString(PyExc_OverflowError,
5886 "repeated string is too long");
5887 return NULL;
5888 }
5889 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5890 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5891 PyErr_SetString(PyExc_OverflowError,
5892 "repeated string is too long");
5893 return NULL;
5894 }
5895 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 if (!u)
5897 return NULL;
5898
5899 p = u->str;
5900
5901 while (len-- > 0) {
5902 Py_UNICODE_COPY(p, str->str, str->length);
5903 p += str->length;
5904 }
5905
5906 return (PyObject*) u;
5907}
5908
5909PyObject *PyUnicode_Replace(PyObject *obj,
5910 PyObject *subobj,
5911 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005912 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913{
5914 PyObject *self;
5915 PyObject *str1;
5916 PyObject *str2;
5917 PyObject *result;
5918
5919 self = PyUnicode_FromObject(obj);
5920 if (self == NULL)
5921 return NULL;
5922 str1 = PyUnicode_FromObject(subobj);
5923 if (str1 == NULL) {
5924 Py_DECREF(self);
5925 return NULL;
5926 }
5927 str2 = PyUnicode_FromObject(replobj);
5928 if (str2 == NULL) {
5929 Py_DECREF(self);
5930 Py_DECREF(str1);
5931 return NULL;
5932 }
Tim Petersced69f82003-09-16 20:30:58 +00005933 result = replace((PyUnicodeObject *)self,
5934 (PyUnicodeObject *)str1,
5935 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 maxcount);
5937 Py_DECREF(self);
5938 Py_DECREF(str1);
5939 Py_DECREF(str2);
5940 return result;
5941}
5942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005943PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944"S.replace (old, new[, maxsplit]) -> unicode\n\
5945\n\
5946Return a copy of S with all occurrences of substring\n\
5947old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005948given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949
5950static PyObject*
5951unicode_replace(PyUnicodeObject *self, PyObject *args)
5952{
5953 PyUnicodeObject *str1;
5954 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005955 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 PyObject *result;
5957
Martin v. Löwis18e16552006-02-15 17:27:45 +00005958 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 return NULL;
5960 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5961 if (str1 == NULL)
5962 return NULL;
5963 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005964 if (str2 == NULL) {
5965 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968
5969 result = replace(self, str1, str2, maxcount);
5970
5971 Py_DECREF(str1);
5972 Py_DECREF(str2);
5973 return result;
5974}
5975
5976static
5977PyObject *unicode_repr(PyObject *unicode)
5978{
5979 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5980 PyUnicode_GET_SIZE(unicode),
5981 1);
5982}
5983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005984PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985"S.rfind(sub [,start [,end]]) -> int\n\
5986\n\
5987Return the highest index in S where substring sub is found,\n\
5988such that sub is contained within s[start,end]. Optional\n\
5989arguments start and end are interpreted as in slice notation.\n\
5990\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005991Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992
5993static PyObject *
5994unicode_rfind(PyUnicodeObject *self, PyObject *args)
5995{
5996 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005997 Py_ssize_t start = 0;
5998 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 PyObject *result;
6000
Guido van Rossumb8872e62000-05-09 14:14:27 +00006001 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6002 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 return NULL;
6004 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6005 (PyObject *)substring);
6006 if (substring == NULL)
6007 return NULL;
6008
Martin v. Löwis18e16552006-02-15 17:27:45 +00006009 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010
6011 Py_DECREF(substring);
6012 return result;
6013}
6014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006015PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016"S.rindex(sub [,start [,end]]) -> int\n\
6017\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006018Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
6020static PyObject *
6021unicode_rindex(PyUnicodeObject *self, PyObject *args)
6022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006023 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006025 Py_ssize_t start = 0;
6026 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027
Guido van Rossumb8872e62000-05-09 14:14:27 +00006028 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6029 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 return NULL;
6031 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6032 (PyObject *)substring);
6033 if (substring == NULL)
6034 return NULL;
6035
6036 result = findstring(self, substring, start, end, -1);
6037
6038 Py_DECREF(substring);
6039 if (result < 0) {
6040 PyErr_SetString(PyExc_ValueError, "substring not found");
6041 return NULL;
6042 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006043 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044}
6045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006046PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006047"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048\n\
6049Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006050done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051
6052static PyObject *
6053unicode_rjust(PyUnicodeObject *self, PyObject *args)
6054{
6055 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006056 Py_UNICODE fillchar = ' ';
6057
6058 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 return NULL;
6060
Tim Peters7a29bd52001-09-12 03:03:31 +00006061 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 Py_INCREF(self);
6063 return (PyObject*) self;
6064 }
6065
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006066 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067}
6068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006070unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071{
6072 /* standard clamping */
6073 if (start < 0)
6074 start = 0;
6075 if (end < 0)
6076 end = 0;
6077 if (end > self->length)
6078 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006079 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 /* full slice, return original string */
6081 Py_INCREF(self);
6082 return (PyObject*) self;
6083 }
6084 if (start > end)
6085 start = end;
6086 /* copy slice */
6087 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6088 end - start);
6089}
6090
6091PyObject *PyUnicode_Split(PyObject *s,
6092 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006093 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094{
6095 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006096
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 s = PyUnicode_FromObject(s);
6098 if (s == NULL)
6099 return NULL;
6100 if (sep != NULL) {
6101 sep = PyUnicode_FromObject(sep);
6102 if (sep == NULL) {
6103 Py_DECREF(s);
6104 return NULL;
6105 }
6106 }
6107
6108 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6109
6110 Py_DECREF(s);
6111 Py_XDECREF(sep);
6112 return result;
6113}
6114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006115PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116"S.split([sep [,maxsplit]]) -> list of strings\n\
6117\n\
6118Return a list of the words in S, using sep as the\n\
6119delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006120splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006121any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122
6123static PyObject*
6124unicode_split(PyUnicodeObject *self, PyObject *args)
6125{
6126 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006127 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 return NULL;
6131
6132 if (substring == Py_None)
6133 return split(self, NULL, maxcount);
6134 else if (PyUnicode_Check(substring))
6135 return split(self, (PyUnicodeObject *)substring, maxcount);
6136 else
6137 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6138}
6139
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006140PyObject *PyUnicode_RSplit(PyObject *s,
6141 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006142 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006143{
6144 PyObject *result;
6145
6146 s = PyUnicode_FromObject(s);
6147 if (s == NULL)
6148 return NULL;
6149 if (sep != NULL) {
6150 sep = PyUnicode_FromObject(sep);
6151 if (sep == NULL) {
6152 Py_DECREF(s);
6153 return NULL;
6154 }
6155 }
6156
6157 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6158
6159 Py_DECREF(s);
6160 Py_XDECREF(sep);
6161 return result;
6162}
6163
6164PyDoc_STRVAR(rsplit__doc__,
6165"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6166\n\
6167Return a list of the words in S, using sep as the\n\
6168delimiter string, starting at the end of the string and\n\
6169working to the front. If maxsplit is given, at most maxsplit\n\
6170splits are done. If sep is not specified, any whitespace string\n\
6171is a separator.");
6172
6173static PyObject*
6174unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6175{
6176 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006177 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006178
Martin v. Löwis18e16552006-02-15 17:27:45 +00006179 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006180 return NULL;
6181
6182 if (substring == Py_None)
6183 return rsplit(self, NULL, maxcount);
6184 else if (PyUnicode_Check(substring))
6185 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6186 else
6187 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6188}
6189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006190PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006191"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192\n\
6193Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006194Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006195is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196
6197static PyObject*
6198unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6199{
Guido van Rossum86662912000-04-11 15:38:46 +00006200 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201
Guido van Rossum86662912000-04-11 15:38:46 +00006202 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 return NULL;
6204
Guido van Rossum86662912000-04-11 15:38:46 +00006205 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206}
6207
6208static
6209PyObject *unicode_str(PyUnicodeObject *self)
6210{
Fred Drakee4315f52000-05-09 19:53:39 +00006211 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212}
6213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006214PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215"S.swapcase() -> unicode\n\
6216\n\
6217Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006218and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219
6220static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006221unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 return fixup(self, fixswapcase);
6224}
6225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006226PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227"S.translate(table) -> unicode\n\
6228\n\
6229Return a copy of the string S, where all characters have been mapped\n\
6230through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006231Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6232Unmapped characters are left untouched. Characters mapped to None\n\
6233are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234
6235static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006236unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237{
Tim Petersced69f82003-09-16 20:30:58 +00006238 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006240 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 "ignore");
6242}
6243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006244PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245"S.upper() -> unicode\n\
6246\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006247Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248
6249static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006250unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 return fixup(self, fixupper);
6253}
6254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006255PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256"S.zfill(width) -> unicode\n\
6257\n\
6258Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006259of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
6261static PyObject *
6262unicode_zfill(PyUnicodeObject *self, PyObject *args)
6263{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006264 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 PyUnicodeObject *u;
6266
Martin v. Löwis18e16552006-02-15 17:27:45 +00006267 Py_ssize_t width;
6268 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 return NULL;
6270
6271 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006272 if (PyUnicode_CheckExact(self)) {
6273 Py_INCREF(self);
6274 return (PyObject*) self;
6275 }
6276 else
6277 return PyUnicode_FromUnicode(
6278 PyUnicode_AS_UNICODE(self),
6279 PyUnicode_GET_SIZE(self)
6280 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 }
6282
6283 fill = width - self->length;
6284
6285 u = pad(self, fill, 0, '0');
6286
Walter Dörwald068325e2002-04-15 13:36:47 +00006287 if (u == NULL)
6288 return NULL;
6289
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 if (u->str[fill] == '+' || u->str[fill] == '-') {
6291 /* move sign to beginning of string */
6292 u->str[0] = u->str[fill];
6293 u->str[fill] = '0';
6294 }
6295
6296 return (PyObject*) u;
6297}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298
#if 0
/* Debugging aid (compiled out): return unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long nfree = unicode_freelist_size;

    return PyInt_FromLong(nfree);
}
#endif
6306
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006307PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006308"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006310Return True if S starts with the specified prefix, False otherwise.\n\
6311With optional start, test S beginning at that position.\n\
6312With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313
6314static PyObject *
6315unicode_startswith(PyUnicodeObject *self,
6316 PyObject *args)
6317{
6318 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006319 Py_ssize_t start = 0;
6320 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 PyObject *result;
6322
Guido van Rossumb8872e62000-05-09 14:14:27 +00006323 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6324 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325 return NULL;
6326 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6327 (PyObject *)substring);
6328 if (substring == NULL)
6329 return NULL;
6330
Guido van Rossum77f6a652002-04-03 22:41:51 +00006331 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332
6333 Py_DECREF(substring);
6334 return result;
6335}
6336
6337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006338PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006339"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006341Return True if S ends with the specified suffix, False otherwise.\n\
6342With optional start, test S beginning at that position.\n\
6343With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344
6345static PyObject *
6346unicode_endswith(PyUnicodeObject *self,
6347 PyObject *args)
6348{
6349 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006350 Py_ssize_t start = 0;
6351 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 PyObject *result;
6353
Guido van Rossumb8872e62000-05-09 14:14:27 +00006354 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6355 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356 return NULL;
6357 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6358 (PyObject *)substring);
6359 if (substring == NULL)
6360 return NULL;
6361
Guido van Rossum77f6a652002-04-03 22:41:51 +00006362 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
6364 Py_DECREF(substring);
6365 return result;
6366}
6367
6368
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006369
6370static PyObject *
6371unicode_getnewargs(PyUnicodeObject *v)
6372{
6373 return Py_BuildValue("(u#)", v->str, v->length);
6374}
6375
6376
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377static PyMethodDef unicode_methods[] = {
6378
6379 /* Order is according to common usage: often used methods should
6380 appear first, since lookup is done sequentially. */
6381
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006382 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6383 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6384 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006385 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006386 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6387 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6388 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6389 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6390 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6391 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6392 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6393 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6394 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6395 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006396 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006397 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006398/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6399 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6400 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6401 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006402 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006403 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006404 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006405 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6406 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6407 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6408 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6409 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6410 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6411 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6412 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6413 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6414 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6415 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6416 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6417 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6418 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006419 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006420#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006421 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422#endif
6423
6424#if 0
6425 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006426 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427#endif
6428
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006429 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 {NULL, NULL}
6431};
6432
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006433static PyObject *
6434unicode_mod(PyObject *v, PyObject *w)
6435{
6436 if (!PyUnicode_Check(v)) {
6437 Py_INCREF(Py_NotImplemented);
6438 return Py_NotImplemented;
6439 }
6440 return PyUnicode_Format(v, w);
6441}
6442
6443static PyNumberMethods unicode_as_number = {
6444 0, /*nb_add*/
6445 0, /*nb_subtract*/
6446 0, /*nb_multiply*/
6447 0, /*nb_divide*/
6448 unicode_mod, /*nb_remainder*/
6449};
6450
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006452 (lenfunc) unicode_length, /* sq_length */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 (binaryfunc) PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006454 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6455 (ssizeargfunc) unicode_getitem, /* sq_item */
6456 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 0, /* sq_ass_item */
6458 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006459 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460};
6461
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006462static PyObject*
6463unicode_subscript(PyUnicodeObject* self, PyObject* item)
6464{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006465 if (PyInt_Check(item) || PyLong_Check(item)) {
6466 Py_ssize_t i = PyInt_AsSsize_t(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006467 if (i == -1 && PyErr_Occurred())
6468 return NULL;
6469 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006470 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006471 return unicode_getitem(self, i);
6472 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006473 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006474 Py_UNICODE* source_buf;
6475 Py_UNICODE* result_buf;
6476 PyObject* result;
6477
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006478 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006479 &start, &stop, &step, &slicelength) < 0) {
6480 return NULL;
6481 }
6482
6483 if (slicelength <= 0) {
6484 return PyUnicode_FromUnicode(NULL, 0);
6485 } else {
6486 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6487 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006488
6489 if (result_buf == NULL)
6490 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006491
6492 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6493 result_buf[i] = source_buf[cur];
6494 }
Tim Petersced69f82003-09-16 20:30:58 +00006495
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006496 result = PyUnicode_FromUnicode(result_buf, slicelength);
6497 PyMem_FREE(result_buf);
6498 return result;
6499 }
6500 } else {
6501 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6502 return NULL;
6503 }
6504}
6505
6506static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006507 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006508 (binaryfunc)unicode_subscript, /* mp_subscript */
6509 (objobjargproc)0, /* mp_ass_subscript */
6510};
6511
Martin v. Löwis18e16552006-02-15 17:27:45 +00006512static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006514 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 const void **ptr)
6516{
6517 if (index != 0) {
6518 PyErr_SetString(PyExc_SystemError,
6519 "accessing non-existent unicode segment");
6520 return -1;
6521 }
6522 *ptr = (void *) self->str;
6523 return PyUnicode_GET_DATA_SIZE(self);
6524}
6525
Martin v. Löwis18e16552006-02-15 17:27:45 +00006526static Py_ssize_t
6527unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 const void **ptr)
6529{
6530 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006531 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 return -1;
6533}
6534
6535static int
6536unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006537 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538{
6539 if (lenp)
6540 *lenp = PyUnicode_GET_DATA_SIZE(self);
6541 return 1;
6542}
6543
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006544static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006546 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 const void **ptr)
6548{
6549 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006550
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 if (index != 0) {
6552 PyErr_SetString(PyExc_SystemError,
6553 "accessing non-existent unicode segment");
6554 return -1;
6555 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006556 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 if (str == NULL)
6558 return -1;
6559 *ptr = (void *) PyString_AS_STRING(str);
6560 return PyString_GET_SIZE(str);
6561}
6562
6563/* Helpers for PyUnicode_Format() */
6564
6565static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006566getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006568 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 if (argidx < arglen) {
6570 (*p_argidx)++;
6571 if (arglen < 0)
6572 return args;
6573 else
6574 return PyTuple_GetItem(args, argidx);
6575 }
6576 PyErr_SetString(PyExc_TypeError,
6577 "not enough arguments for format string");
6578 return NULL;
6579}
6580
/* Bit flags collected while scanning a format specifier; one bit each. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6586
Martin v. Löwis18e16552006-02-15 17:27:45 +00006587static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006588strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006590 register Py_ssize_t i;
6591 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 for (i = len - 1; i >= 0; i--)
6593 buffer[i] = (Py_UNICODE) charbuffer[i];
6594
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 return len;
6596}
6597
Neal Norwitzfc76d632006-01-10 06:03:13 +00006598static int
6599doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6600{
Tim Peters15231542006-02-16 01:08:01 +00006601 Py_ssize_t result;
6602
Neal Norwitzfc76d632006-01-10 06:03:13 +00006603 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006604 result = strtounicode(buffer, (char *)buffer);
6605 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006606}
6607
6608static int
6609longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6610{
Tim Peters15231542006-02-16 01:08:01 +00006611 Py_ssize_t result;
6612
Neal Norwitzfc76d632006-01-10 06:03:13 +00006613 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006614 result = strtounicode(buffer, (char *)buffer);
6615 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006616}
6617
Guido van Rossum078151d2002-08-11 04:24:12 +00006618/* XXX To save some code duplication, formatfloat/long/int could have been
6619 shared with stringobject.c, converting from 8-bit to Unicode after the
6620 formatting is done. */
6621
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622static int
6623formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006624 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 int flags,
6626 int prec,
6627 int type,
6628 PyObject *v)
6629{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006630 /* fmt = '%#.' + `prec` + `type`
6631 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 char fmt[20];
6633 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006634
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 x = PyFloat_AsDouble(v);
6636 if (x == -1.0 && PyErr_Occurred())
6637 return -1;
6638 if (prec < 0)
6639 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6641 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006642 /* Worst case length calc to ensure no buffer overrun:
6643
6644 'g' formats:
6645 fmt = %#.<prec>g
6646 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6647 for any double rep.)
6648 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6649
6650 'f' formats:
6651 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6652 len = 1 + 50 + 1 + prec = 52 + prec
6653
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006654 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006655 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006656
6657 */
6658 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6659 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006660 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006661 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006662 return -1;
6663 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006664 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6665 (flags&F_ALT) ? "#" : "",
6666 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006667 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668}
6669
Tim Peters38fd5b62000-09-21 05:43:11 +00006670static PyObject*
6671formatlong(PyObject *val, int flags, int prec, int type)
6672{
6673 char *buf;
6674 int i, len;
6675 PyObject *str; /* temporary string object. */
6676 PyUnicodeObject *result;
6677
6678 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6679 if (!str)
6680 return NULL;
6681 result = _PyUnicode_New(len);
6682 for (i = 0; i < len; i++)
6683 result->str[i] = buf[i];
6684 result->str[len] = 0;
6685 Py_DECREF(str);
6686 return (PyObject*)result;
6687}
6688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689static int
6690formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006691 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 int flags,
6693 int prec,
6694 int type,
6695 PyObject *v)
6696{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006697 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006698 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6699 * + 1 + 1
6700 * = 24
6701 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006702 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006703 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 long x;
6705
6706 x = PyInt_AsLong(v);
6707 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006708 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006709 if (x < 0 && type == 'u') {
6710 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006711 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006712 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6713 sign = "-";
6714 else
6715 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006717 prec = 1;
6718
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006719 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6720 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006721 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006722 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006723 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006724 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006725 return -1;
6726 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006727
6728 if ((flags & F_ALT) &&
6729 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006730 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006731 * of issues that cause pain:
6732 * - when 0 is being converted, the C standard leaves off
6733 * the '0x' or '0X', which is inconsistent with other
6734 * %#x/%#X conversions and inconsistent with Python's
6735 * hex() function
6736 * - there are platforms that violate the standard and
6737 * convert 0 with the '0x' or '0X'
6738 * (Metrowerks, Compaq Tru64)
6739 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006740 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006741 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006742 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006743 * We can achieve the desired consistency by inserting our
6744 * own '0x' or '0X' prefix, and substituting %x/%X in place
6745 * of %#x/%#X.
6746 *
6747 * Note that this is the same approach as used in
6748 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006749 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006750 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6751 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006752 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006753 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006754 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6755 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006756 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006757 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006758 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006759 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006760 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006761 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762}
6763
6764static int
6765formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006766 size_t buflen,
6767 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006769 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006770 if (PyUnicode_Check(v)) {
6771 if (PyUnicode_GET_SIZE(v) != 1)
6772 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006776 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006777 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006778 goto onError;
6779 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781
6782 else {
6783 /* Integer input truncated to a character */
6784 long x;
6785 x = PyInt_AsLong(v);
6786 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006787 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006788#ifdef Py_UNICODE_WIDE
6789 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006790 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006791 "%c arg not in range(0x110000) "
6792 "(wide Python build)");
6793 return -1;
6794 }
6795#else
6796 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006797 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006798 "%c arg not in range(0x10000) "
6799 "(narrow Python build)");
6800 return -1;
6801 }
6802#endif
6803 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 }
6805 buf[1] = '\0';
6806 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006807
6808 onError:
6809 PyErr_SetString(PyExc_TypeError,
6810 "%c requires int or char");
6811 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812}
6813
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006814/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6815
6816 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6817 chars are formatted. XXX This is a magic number. Each formatting
6818 routine does bounds checking to ensure no overflow, but a better
6819 solution may be to malloc a buffer of appropriate size for each
6820 format. For now, the current solution is sufficient.
6821*/
6822#define FORMATBUFLEN (size_t)120
6823
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824PyObject *PyUnicode_Format(PyObject *format,
6825 PyObject *args)
6826{
6827 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006828 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 int args_owned = 0;
6830 PyUnicodeObject *result = NULL;
6831 PyObject *dict = NULL;
6832 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006833
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 if (format == NULL || args == NULL) {
6835 PyErr_BadInternalCall();
6836 return NULL;
6837 }
6838 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006839 if (uformat == NULL)
6840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 fmt = PyUnicode_AS_UNICODE(uformat);
6842 fmtcnt = PyUnicode_GET_SIZE(uformat);
6843
6844 reslen = rescnt = fmtcnt + 100;
6845 result = _PyUnicode_New(reslen);
6846 if (result == NULL)
6847 goto onError;
6848 res = PyUnicode_AS_UNICODE(result);
6849
6850 if (PyTuple_Check(args)) {
6851 arglen = PyTuple_Size(args);
6852 argidx = 0;
6853 }
6854 else {
6855 arglen = -1;
6856 argidx = -2;
6857 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006858 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6859 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 dict = args;
6861
6862 while (--fmtcnt >= 0) {
6863 if (*fmt != '%') {
6864 if (--rescnt < 0) {
6865 rescnt = fmtcnt + 100;
6866 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006867 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 return NULL;
6869 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6870 --rescnt;
6871 }
6872 *res++ = *fmt++;
6873 }
6874 else {
6875 /* Got a format specifier */
6876 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006877 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 Py_UNICODE c = '\0';
6880 Py_UNICODE fill;
6881 PyObject *v = NULL;
6882 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006883 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006885 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006886 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887
6888 fmt++;
6889 if (*fmt == '(') {
6890 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006891 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 PyObject *key;
6893 int pcount = 1;
6894
6895 if (dict == NULL) {
6896 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006897 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 goto onError;
6899 }
6900 ++fmt;
6901 --fmtcnt;
6902 keystart = fmt;
6903 /* Skip over balanced parentheses */
6904 while (pcount > 0 && --fmtcnt >= 0) {
6905 if (*fmt == ')')
6906 --pcount;
6907 else if (*fmt == '(')
6908 ++pcount;
6909 fmt++;
6910 }
6911 keylen = fmt - keystart - 1;
6912 if (fmtcnt < 0 || pcount > 0) {
6913 PyErr_SetString(PyExc_ValueError,
6914 "incomplete format key");
6915 goto onError;
6916 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006917#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006918 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 then looked up since Python uses strings to hold
6920 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006921 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 key = PyUnicode_EncodeUTF8(keystart,
6923 keylen,
6924 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006925#else
6926 key = PyUnicode_FromUnicode(keystart, keylen);
6927#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 if (key == NULL)
6929 goto onError;
6930 if (args_owned) {
6931 Py_DECREF(args);
6932 args_owned = 0;
6933 }
6934 args = PyObject_GetItem(dict, key);
6935 Py_DECREF(key);
6936 if (args == NULL) {
6937 goto onError;
6938 }
6939 args_owned = 1;
6940 arglen = -1;
6941 argidx = -2;
6942 }
6943 while (--fmtcnt >= 0) {
6944 switch (c = *fmt++) {
6945 case '-': flags |= F_LJUST; continue;
6946 case '+': flags |= F_SIGN; continue;
6947 case ' ': flags |= F_BLANK; continue;
6948 case '#': flags |= F_ALT; continue;
6949 case '0': flags |= F_ZERO; continue;
6950 }
6951 break;
6952 }
6953 if (c == '*') {
6954 v = getnextarg(args, arglen, &argidx);
6955 if (v == NULL)
6956 goto onError;
6957 if (!PyInt_Check(v)) {
6958 PyErr_SetString(PyExc_TypeError,
6959 "* wants int");
6960 goto onError;
6961 }
6962 width = PyInt_AsLong(v);
6963 if (width < 0) {
6964 flags |= F_LJUST;
6965 width = -width;
6966 }
6967 if (--fmtcnt >= 0)
6968 c = *fmt++;
6969 }
6970 else if (c >= '0' && c <= '9') {
6971 width = c - '0';
6972 while (--fmtcnt >= 0) {
6973 c = *fmt++;
6974 if (c < '0' || c > '9')
6975 break;
6976 if ((width*10) / 10 != width) {
6977 PyErr_SetString(PyExc_ValueError,
6978 "width too big");
6979 goto onError;
6980 }
6981 width = width*10 + (c - '0');
6982 }
6983 }
6984 if (c == '.') {
6985 prec = 0;
6986 if (--fmtcnt >= 0)
6987 c = *fmt++;
6988 if (c == '*') {
6989 v = getnextarg(args, arglen, &argidx);
6990 if (v == NULL)
6991 goto onError;
6992 if (!PyInt_Check(v)) {
6993 PyErr_SetString(PyExc_TypeError,
6994 "* wants int");
6995 goto onError;
6996 }
6997 prec = PyInt_AsLong(v);
6998 if (prec < 0)
6999 prec = 0;
7000 if (--fmtcnt >= 0)
7001 c = *fmt++;
7002 }
7003 else if (c >= '0' && c <= '9') {
7004 prec = c - '0';
7005 while (--fmtcnt >= 0) {
7006 c = Py_CHARMASK(*fmt++);
7007 if (c < '0' || c > '9')
7008 break;
7009 if ((prec*10) / 10 != prec) {
7010 PyErr_SetString(PyExc_ValueError,
7011 "prec too big");
7012 goto onError;
7013 }
7014 prec = prec*10 + (c - '0');
7015 }
7016 }
7017 } /* prec */
7018 if (fmtcnt >= 0) {
7019 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 if (--fmtcnt >= 0)
7021 c = *fmt++;
7022 }
7023 }
7024 if (fmtcnt < 0) {
7025 PyErr_SetString(PyExc_ValueError,
7026 "incomplete format");
7027 goto onError;
7028 }
7029 if (c != '%') {
7030 v = getnextarg(args, arglen, &argidx);
7031 if (v == NULL)
7032 goto onError;
7033 }
7034 sign = 0;
7035 fill = ' ';
7036 switch (c) {
7037
7038 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007039 pbuf = formatbuf;
7040 /* presume that buffer length is at least 1 */
7041 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 len = 1;
7043 break;
7044
7045 case 's':
7046 case 'r':
7047 if (PyUnicode_Check(v) && c == 's') {
7048 temp = v;
7049 Py_INCREF(temp);
7050 }
7051 else {
7052 PyObject *unicode;
7053 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007054 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 else
7056 temp = PyObject_Repr(v);
7057 if (temp == NULL)
7058 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007059 if (PyUnicode_Check(temp))
7060 /* nothing to do */;
7061 else if (PyString_Check(temp)) {
7062 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00007063 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00007065 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 "strict");
7067 Py_DECREF(temp);
7068 temp = unicode;
7069 if (temp == NULL)
7070 goto onError;
7071 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007072 else {
7073 Py_DECREF(temp);
7074 PyErr_SetString(PyExc_TypeError,
7075 "%s argument has non-string str()");
7076 goto onError;
7077 }
7078 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007079 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 len = PyUnicode_GET_SIZE(temp);
7081 if (prec >= 0 && len > prec)
7082 len = prec;
7083 break;
7084
7085 case 'i':
7086 case 'd':
7087 case 'u':
7088 case 'o':
7089 case 'x':
7090 case 'X':
7091 if (c == 'i')
7092 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007093 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007094 temp = formatlong(v, flags, prec, c);
7095 if (!temp)
7096 goto onError;
7097 pbuf = PyUnicode_AS_UNICODE(temp);
7098 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007099 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007101 else {
7102 pbuf = formatbuf;
7103 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7104 flags, prec, c, v);
7105 if (len < 0)
7106 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007107 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007108 }
7109 if (flags & F_ZERO)
7110 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111 break;
7112
7113 case 'e':
7114 case 'E':
7115 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007116 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 case 'g':
7118 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007119 if (c == 'F')
7120 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007121 pbuf = formatbuf;
7122 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7123 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 if (len < 0)
7125 goto onError;
7126 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007127 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128 fill = '0';
7129 break;
7130
7131 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007132 pbuf = formatbuf;
7133 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 if (len < 0)
7135 goto onError;
7136 break;
7137
7138 default:
7139 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007140 "unsupported format character '%c' (0x%x) "
7141 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007142 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007143 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007144 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 goto onError;
7146 }
7147 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007148 if (*pbuf == '-' || *pbuf == '+') {
7149 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150 len--;
7151 }
7152 else if (flags & F_SIGN)
7153 sign = '+';
7154 else if (flags & F_BLANK)
7155 sign = ' ';
7156 else
7157 sign = 0;
7158 }
7159 if (width < len)
7160 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007161 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 reslen -= rescnt;
7163 rescnt = width + fmtcnt + 100;
7164 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007165 if (reslen < 0) {
7166 Py_DECREF(result);
7167 return PyErr_NoMemory();
7168 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007169 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 return NULL;
7171 res = PyUnicode_AS_UNICODE(result)
7172 + reslen - rescnt;
7173 }
7174 if (sign) {
7175 if (fill != ' ')
7176 *res++ = sign;
7177 rescnt--;
7178 if (width > len)
7179 width--;
7180 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007181 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7182 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007183 assert(pbuf[1] == c);
7184 if (fill != ' ') {
7185 *res++ = *pbuf++;
7186 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007187 }
Tim Petersfff53252001-04-12 18:38:48 +00007188 rescnt -= 2;
7189 width -= 2;
7190 if (width < 0)
7191 width = 0;
7192 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194 if (width > len && !(flags & F_LJUST)) {
7195 do {
7196 --rescnt;
7197 *res++ = fill;
7198 } while (--width > len);
7199 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007200 if (fill == ' ') {
7201 if (sign)
7202 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007203 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007204 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007205 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007206 *res++ = *pbuf++;
7207 *res++ = *pbuf++;
7208 }
7209 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007210 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 res += len;
7212 rescnt -= len;
7213 while (--width >= len) {
7214 --rescnt;
7215 *res++ = ' ';
7216 }
7217 if (dict && (argidx < arglen) && c != '%') {
7218 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007219 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 goto onError;
7221 }
7222 Py_XDECREF(temp);
7223 } /* '%' */
7224 } /* until end */
7225 if (argidx < arglen && !dict) {
7226 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007227 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228 goto onError;
7229 }
7230
7231 if (args_owned) {
7232 Py_DECREF(args);
7233 }
7234 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007235 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007236 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 return (PyObject *)result;
7238
7239 onError:
7240 Py_XDECREF(result);
7241 Py_DECREF(uformat);
7242 if (args_owned) {
7243 Py_DECREF(args);
7244 }
7245 return NULL;
7246}
7247
7248static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007249 (readbufferproc) unicode_buffer_getreadbuf,
7250 (writebufferproc) unicode_buffer_getwritebuf,
7251 (segcountproc) unicode_buffer_getsegcount,
7252 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253};
7254
Jeremy Hylton938ace62002-07-17 16:30:39 +00007255static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007256unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7257
Tim Peters6d6c1a32001-08-02 04:15:00 +00007258static PyObject *
7259unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7260{
7261 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007262 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007263 char *encoding = NULL;
7264 char *errors = NULL;
7265
Guido van Rossume023fe02001-08-30 03:12:59 +00007266 if (type != &PyUnicode_Type)
7267 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007268 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7269 kwlist, &x, &encoding, &errors))
7270 return NULL;
7271 if (x == NULL)
7272 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007273 if (encoding == NULL && errors == NULL)
7274 return PyObject_Unicode(x);
7275 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007276 return PyUnicode_FromEncodedObject(x, encoding, errors);
7277}
7278
Guido van Rossume023fe02001-08-30 03:12:59 +00007279static PyObject *
7280unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7281{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007282 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007283 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007284
7285 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7286 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7287 if (tmp == NULL)
7288 return NULL;
7289 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007290 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007291 if (pnew == NULL) {
7292 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007293 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007294 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007295 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7296 if (pnew->str == NULL) {
7297 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007298 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007299 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007300 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007301 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007302 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7303 pnew->length = n;
7304 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007305 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007306 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007307}
7308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007309PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007310"unicode(string [, encoding[, errors]]) -> object\n\
7311\n\
7312Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007313encoding defaults to the current default string encoding.\n\
7314errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007315
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316PyTypeObject PyUnicode_Type = {
7317 PyObject_HEAD_INIT(&PyType_Type)
7318 0, /* ob_size */
7319 "unicode", /* tp_name */
7320 sizeof(PyUnicodeObject), /* tp_size */
7321 0, /* tp_itemsize */
7322 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007323 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007325 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326 0, /* tp_setattr */
7327 (cmpfunc) unicode_compare, /* tp_compare */
7328 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007329 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007331 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 (hashfunc) unicode_hash, /* tp_hash*/
7333 0, /* tp_call*/
7334 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007335 PyObject_GenericGetAttr, /* tp_getattro */
7336 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007338 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7339 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007340 unicode_doc, /* tp_doc */
7341 0, /* tp_traverse */
7342 0, /* tp_clear */
7343 0, /* tp_richcompare */
7344 0, /* tp_weaklistoffset */
7345 0, /* tp_iter */
7346 0, /* tp_iternext */
7347 unicode_methods, /* tp_methods */
7348 0, /* tp_members */
7349 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007350 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007351 0, /* tp_dict */
7352 0, /* tp_descr_get */
7353 0, /* tp_descr_set */
7354 0, /* tp_dictoffset */
7355 0, /* tp_init */
7356 0, /* tp_alloc */
7357 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007358 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359};
7360
7361/* Initialize the Unicode implementation */
7362
Thomas Wouters78890102000-07-22 19:25:51 +00007363void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007365 int i;
7366
Fred Drakee4315f52000-05-09 19:53:39 +00007367 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007368 unicode_freelist = NULL;
7369 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007371 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007372 for (i = 0; i < 256; i++)
7373 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007374 if (PyType_Ready(&PyUnicode_Type) < 0)
7375 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376}
7377
7378/* Finalize the Unicode implementation */
7379
7380void
Thomas Wouters78890102000-07-22 19:25:51 +00007381_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007383 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007384 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007386 Py_XDECREF(unicode_empty);
7387 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007389 for (i = 0; i < 256; i++) {
7390 if (unicode_latin1[i]) {
7391 Py_DECREF(unicode_latin1[i]);
7392 unicode_latin1[i] = NULL;
7393 }
7394 }
7395
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007396 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 PyUnicodeObject *v = u;
7398 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007399 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007400 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007401 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007402 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007404 unicode_freelist = NULL;
7405 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007407
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/