blob: b044d68ab72415181fbabe931f9de94914b3d3ac [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode buffer, which reduces malloc() overhead for small
   Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000122 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000157 assert(length < INT_MAX);
158 unicode->length = (int)length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000160 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000161 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000165 }
166 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000167
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 return 0;
169}
170
/* We allocate one extra Py_UNICODE element (not just one byte) to make
   sure the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
178
179static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000180PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181{
182 register PyUnicodeObject *unicode;
183
Tim Petersced69f82003-09-16 20:30:58 +0000184 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 if (length == 0 && unicode_empty != NULL) {
186 Py_INCREF(unicode_empty);
187 return unicode_empty;
188 }
189
190 /* Unicode freelist & memory allocation */
191 if (unicode_freelist) {
192 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000193 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000195 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000196 /* Keep-Alive optimization: we only upsize the buffer,
197 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000198 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000199 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000200 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000204 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000206 }
207 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 }
209 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000210 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 if (unicode == NULL)
212 return NULL;
213 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
214 }
215
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000216 if (!unicode->str) {
217 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000218 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000219 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000220 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000221 * the caller fails before initializing str -- unicode_resize()
222 * reads str[0], and the Keep-Alive optimization can keep memory
223 * allocated for str alive across a call to unicode_dealloc(unicode).
224 * We don't want unicode_resize to read uninitialized memory in
225 * that case.
226 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000227 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str[length] = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000229 assert(length<INT_MAX);
230 unicode->length = (int)length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000232 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000234
235 onError:
236 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000237 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239}
240
241static
Guido van Rossum9475a232001-10-05 20:51:39 +0000242void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000244 if (PyUnicode_CheckExact(unicode) &&
245 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000246 /* Keep-Alive optimization */
247 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 unicode->str = NULL;
250 unicode->length = 0;
251 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000252 if (unicode->defenc) {
253 Py_DECREF(unicode->defenc);
254 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000255 }
256 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 *(PyUnicodeObject **)unicode = unicode_freelist;
258 unicode_freelist = unicode;
259 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000262 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000263 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000264 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 }
266}
267
Martin v. Löwis18e16552006-02-15 17:27:45 +0000268int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269{
270 register PyUnicodeObject *v;
271
272 /* Argument checks */
273 if (unicode == NULL) {
274 PyErr_BadInternalCall();
275 return -1;
276 }
277 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000278 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000279 PyErr_BadInternalCall();
280 return -1;
281 }
282
283 /* Resizing unicode_empty and single character objects is not
284 possible since these are being shared. We simply return a fresh
285 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000286 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000287 (v == unicode_empty || v->length == 1)) {
288 PyUnicodeObject *w = _PyUnicode_New(length);
289 if (w == NULL)
290 return -1;
291 Py_UNICODE_COPY(w->str, v->str,
292 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000293 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000294 *unicode = (PyObject *)w;
295 return 0;
296 }
297
298 /* Note that we don't have to modify *unicode for unshared Unicode
299 objects, since we can modify them in-place. */
300 return unicode_resize(v, length);
301}
302
/* Internal API for use in unicodeobject.c only !  Forwards to
   PyUnicode_Resize(), casting the given variable address to PyObject **. */
#define _PyUnicode_Resize(objref, newlength) \
        PyUnicode_Resize((PyObject **)(objref), newlength)
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000308 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309{
310 PyUnicodeObject *unicode;
311
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 /* If the Unicode data is known at construction time, we can apply
313 some optimizations which share commonly used objects. */
314 if (u != NULL) {
315
316 /* Optimization for empty strings */
317 if (size == 0 && unicode_empty != NULL) {
318 Py_INCREF(unicode_empty);
319 return (PyObject *)unicode_empty;
320 }
321
322 /* Single character Unicode objects in the Latin-1 range are
323 shared when using this constructor */
324 if (size == 1 && *u < 256) {
325 unicode = unicode_latin1[*u];
326 if (!unicode) {
327 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 if (!unicode)
329 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000330 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000331 unicode_latin1[*u] = unicode;
332 }
333 Py_INCREF(unicode);
334 return (PyObject *)unicode;
335 }
336 }
Tim Petersced69f82003-09-16 20:30:58 +0000337
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 unicode = _PyUnicode_New(size);
339 if (!unicode)
340 return NULL;
341
342 /* Copy the Unicode data into the new object */
343 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000344 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345
346 return (PyObject *)unicode;
347}
348
349#ifdef HAVE_WCHAR_H
350
351PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000352 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353{
354 PyUnicodeObject *unicode;
355
356 if (w == NULL) {
357 PyErr_BadInternalCall();
358 return NULL;
359 }
360
361 unicode = _PyUnicode_New(size);
362 if (!unicode)
363 return NULL;
364
365 /* Copy the wchar_t data into the new object */
366#ifdef HAVE_USABLE_WCHAR_T
367 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000368#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 {
370 register Py_UNICODE *u;
371 register int i;
372 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000373 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 *u++ = *w++;
375 }
376#endif
377
378 return (PyObject *)unicode;
379}
380
Martin v. Löwis18e16552006-02-15 17:27:45 +0000381Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
382 wchar_t *w,
383 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384{
385 if (unicode == NULL) {
386 PyErr_BadInternalCall();
387 return -1;
388 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000389
390 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000392 size = PyUnicode_GET_SIZE(unicode) + 1;
393
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394#ifdef HAVE_USABLE_WCHAR_T
395 memcpy(w, unicode->str, size * sizeof(wchar_t));
396#else
397 {
398 register Py_UNICODE *u;
399 register int i;
400 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000401 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 *w++ = *u++;
403 }
404#endif
405
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000406 if (size > PyUnicode_GET_SIZE(unicode))
407 return PyUnicode_GET_SIZE(unicode);
408 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000409 return size;
410}
411
412#endif
413
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000414PyObject *PyUnicode_FromOrdinal(int ordinal)
415{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000416 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000417
418#ifdef Py_UNICODE_WIDE
419 if (ordinal < 0 || ordinal > 0x10ffff) {
420 PyErr_SetString(PyExc_ValueError,
421 "unichr() arg not in range(0x110000) "
422 "(wide Python build)");
423 return NULL;
424 }
425#else
426 if (ordinal < 0 || ordinal > 0xffff) {
427 PyErr_SetString(PyExc_ValueError,
428 "unichr() arg not in range(0x10000) "
429 "(narrow Python build)");
430 return NULL;
431 }
432#endif
433
Hye-Shik Chang40574832004-04-06 07:24:51 +0000434 s[0] = (Py_UNICODE)ordinal;
435 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000436}
437
Guido van Rossumd57fd912000-03-10 22:53:23 +0000438PyObject *PyUnicode_FromObject(register PyObject *obj)
439{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 /* XXX Perhaps we should make this API an alias of
441 PyObject_Unicode() instead ?! */
442 if (PyUnicode_CheckExact(obj)) {
443 Py_INCREF(obj);
444 return obj;
445 }
446 if (PyUnicode_Check(obj)) {
447 /* For a Unicode subtype that's not a Unicode object,
448 return a true Unicode object with the same data. */
449 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
450 PyUnicode_GET_SIZE(obj));
451 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000452 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
453}
454
455PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
456 const char *encoding,
457 const char *errors)
458{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000460 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000461 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000462
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 if (obj == NULL) {
464 PyErr_BadInternalCall();
465 return NULL;
466 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000468#if 0
469 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000470 that no encodings is given and then redirect to
471 PyObject_Unicode() which then applies the additional logic for
472 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000473
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000474 NOTE: This API should really only be used for object which
475 represent *encoded* Unicode !
476
477 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000478 if (PyUnicode_Check(obj)) {
479 if (encoding) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000485 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000486#else
487 if (PyUnicode_Check(obj)) {
488 PyErr_SetString(PyExc_TypeError,
489 "decoding Unicode is not supported");
490 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000491 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000492#endif
493
494 /* Coerce object */
495 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000496 s = PyString_AS_STRING(obj);
497 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000498 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
500 /* Overwrite the error message with something more useful in
501 case of a TypeError. */
502 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000503 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000504 "coercing to Unicode: need string or buffer, "
505 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000506 obj->ob_type->tp_name);
507 goto onError;
508 }
Tim Petersced69f82003-09-16 20:30:58 +0000509
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000510 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000511 if (len == 0) {
512 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000513 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 }
Tim Petersced69f82003-09-16 20:30:58 +0000515 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000517
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518 return v;
519
520 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522}
523
524PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000525 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000526 const char *encoding,
527 const char *errors)
528{
529 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000530
531 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000532 encoding = PyUnicode_GetDefaultEncoding();
533
534 /* Shortcuts for common default encodings */
535 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000536 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000537 else if (strcmp(encoding, "latin-1") == 0)
538 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000539#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
540 else if (strcmp(encoding, "mbcs") == 0)
541 return PyUnicode_DecodeMBCS(s, size, errors);
542#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000543 else if (strcmp(encoding, "ascii") == 0)
544 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000545
546 /* Decode via the codec registry */
547 buffer = PyBuffer_FromMemory((void *)s, size);
548 if (buffer == NULL)
549 goto onError;
550 unicode = PyCodec_Decode(buffer, encoding, errors);
551 if (unicode == NULL)
552 goto onError;
553 if (!PyUnicode_Check(unicode)) {
554 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000555 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000556 unicode->ob_type->tp_name);
557 Py_DECREF(unicode);
558 goto onError;
559 }
560 Py_DECREF(buffer);
561 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000562
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 onError:
564 Py_XDECREF(buffer);
565 return NULL;
566}
567
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000568PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
569 const char *encoding,
570 const char *errors)
571{
572 PyObject *v;
573
574 if (!PyUnicode_Check(unicode)) {
575 PyErr_BadArgument();
576 goto onError;
577 }
578
579 if (encoding == NULL)
580 encoding = PyUnicode_GetDefaultEncoding();
581
582 /* Decode via the codec registry */
583 v = PyCodec_Decode(unicode, encoding, errors);
584 if (v == NULL)
585 goto onError;
586 return v;
587
588 onError:
589 return NULL;
590}
591
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000593 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000594 const char *encoding,
595 const char *errors)
596{
597 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000598
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599 unicode = PyUnicode_FromUnicode(s, size);
600 if (unicode == NULL)
601 return NULL;
602 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
603 Py_DECREF(unicode);
604 return v;
605}
606
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000607PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
608 const char *encoding,
609 const char *errors)
610{
611 PyObject *v;
612
613 if (!PyUnicode_Check(unicode)) {
614 PyErr_BadArgument();
615 goto onError;
616 }
617
618 if (encoding == NULL)
619 encoding = PyUnicode_GetDefaultEncoding();
620
621 /* Encode via the codec registry */
622 v = PyCodec_Encode(unicode, encoding, errors);
623 if (v == NULL)
624 goto onError;
625 return v;
626
627 onError:
628 return NULL;
629}
630
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
632 const char *encoding,
633 const char *errors)
634{
635 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637 if (!PyUnicode_Check(unicode)) {
638 PyErr_BadArgument();
639 goto onError;
640 }
Fred Drakee4315f52000-05-09 19:53:39 +0000641
Tim Petersced69f82003-09-16 20:30:58 +0000642 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000643 encoding = PyUnicode_GetDefaultEncoding();
644
645 /* Shortcuts for common default encodings */
646 if (errors == NULL) {
647 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000648 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000649 else if (strcmp(encoding, "latin-1") == 0)
650 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000651#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
652 else if (strcmp(encoding, "mbcs") == 0)
653 return PyUnicode_AsMBCSString(unicode);
654#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000655 else if (strcmp(encoding, "ascii") == 0)
656 return PyUnicode_AsASCIIString(unicode);
657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658
659 /* Encode via the codec registry */
660 v = PyCodec_Encode(unicode, encoding, errors);
661 if (v == NULL)
662 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 if (!PyString_Check(v)) {
664 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000665 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 v->ob_type->tp_name);
667 Py_DECREF(v);
668 goto onError;
669 }
670 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000671
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 onError:
673 return NULL;
674}
675
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000676PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
677 const char *errors)
678{
679 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
680
681 if (v)
682 return v;
683 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
684 if (v && errors == NULL)
685 ((PyUnicodeObject *)unicode)->defenc = v;
686 return v;
687}
688
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
690{
691 if (!PyUnicode_Check(unicode)) {
692 PyErr_BadArgument();
693 goto onError;
694 }
695 return PyUnicode_AS_UNICODE(unicode);
696
697 onError:
698 return NULL;
699}
700
Martin v. Löwis18e16552006-02-15 17:27:45 +0000701Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000702{
703 if (!PyUnicode_Check(unicode)) {
704 PyErr_BadArgument();
705 goto onError;
706 }
707 return PyUnicode_GET_SIZE(unicode);
708
709 onError:
710 return -1;
711}
712
Thomas Wouters78890102000-07-22 19:25:51 +0000713const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000714{
715 return unicode_default_encoding;
716}
717
718int PyUnicode_SetDefaultEncoding(const char *encoding)
719{
720 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000721
Fred Drakee4315f52000-05-09 19:53:39 +0000722 /* Make sure the encoding is valid. As side effect, this also
723 loads the encoding into the codec registry cache. */
724 v = _PyCodec_Lookup(encoding);
725 if (v == NULL)
726 goto onError;
727 Py_DECREF(v);
728 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000729 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000730 sizeof(unicode_default_encoding));
731 return 0;
732
733 onError:
734 return -1;
735}
736
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000737/* error handling callback helper:
738 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000739 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000740 and adjust various state variables.
741 return 0 on success, -1 on error
742*/
743
744static
745int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
746 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000747 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
748 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000749{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000750 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000751
752 PyObject *restuple = NULL;
753 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000754 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
755 Py_ssize_t requiredsize;
756 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000757 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000758 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000759 int res = -1;
760
761 if (*errorHandler == NULL) {
762 *errorHandler = PyCodec_LookupError(errors);
763 if (*errorHandler == NULL)
764 goto onError;
765 }
766
767 if (*exceptionObject == NULL) {
768 *exceptionObject = PyUnicodeDecodeError_Create(
769 encoding, input, insize, *startinpos, *endinpos, reason);
770 if (*exceptionObject == NULL)
771 goto onError;
772 }
773 else {
774 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
775 goto onError;
776 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
777 goto onError;
778 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
779 goto onError;
780 }
781
782 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
783 if (restuple == NULL)
784 goto onError;
785 if (!PyTuple_Check(restuple)) {
786 PyErr_Format(PyExc_TypeError, &argparse[4]);
787 goto onError;
788 }
789 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
790 goto onError;
791 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000792 newpos = insize+newpos;
793 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000794 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000795 goto onError;
796 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000797
798 /* need more space? (at least enough for what we
799 have+the replacement+the rest of the string (starting
800 at the new input position), so we won't have to check space
801 when there are no errors in the rest of the string) */
802 repptr = PyUnicode_AS_UNICODE(repunicode);
803 repsize = PyUnicode_GET_SIZE(repunicode);
804 requiredsize = *outpos + repsize + insize-newpos;
805 if (requiredsize > outsize) {
806 if (requiredsize<2*outsize)
807 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000808 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000809 goto onError;
810 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
811 }
812 *endinpos = newpos;
813 *inptr = input + newpos;
814 Py_UNICODE_COPY(*outptr, repptr, repsize);
815 *outptr += repsize;
816 *outpos += repsize;
817 /* we made it! */
818 res = 0;
819
820 onError:
821 Py_XDECREF(restuple);
822 return res;
823}
824
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000825/* --- UTF-7 Codec -------------------------------------------------------- */
826
827/* see RFC2152 for details */
828
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by character code.  Each entry says whether the character is
   special, i.e. cannot be directly encoded:
      0 - not special
      1 - special
      2 - whitespace (optional)
      3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */  2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
847
/* Helper macros for the UTF-7 codec.  They are private to the two UTF-7
   functions below and are #undef'ed after them.  Arguments may be
   evaluated more than once, so only pass side-effect free expressions. */

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c is special according to utf7_special[] (or lies
   outside the ASCII range).  encodeO / encodeWS select whether the
   optional "Set O" and whitespace classes count as special too. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  This deliberately avoids
   isalnum(): that function depends on the current locale and its
   behavior is undefined for arguments that are not representable as
   unsigned char (e.g. Py_UNICODE values above 255).  Like UB64() below,
   the range tests assume an ASCII execution character set. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Six-bit value of the base64 digit c; only meaningful if B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit one base64 digit to *out for every complete group of six bits
   among the low `bits` bits of ch, most significant group first.
   Updates out and bits; fewer than six bits remain afterwards. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Extract every complete 16-bit unit from the low `bits` bits of ch and
   append it to *out.  Updates out, bits and surrogate.  Must be expanded
   inside a function that provides the variable `errmsg` and the label
   `utf7Error`, which are used to report units in the range
   0xDC00..0xDFFF. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000891PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000892 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000893 const char *errors)
894{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000895 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000896 Py_ssize_t startinpos;
897 Py_ssize_t endinpos;
898 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000899 const char *e;
900 PyUnicodeObject *unicode;
901 Py_UNICODE *p;
902 const char *errmsg = "";
903 int inShift = 0;
904 unsigned int bitsleft = 0;
905 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000906 int surrogate = 0;
907 PyObject *errorHandler = NULL;
908 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000909
910 unicode = _PyUnicode_New(size);
911 if (!unicode)
912 return NULL;
913 if (size == 0)
914 return (PyObject *)unicode;
915
916 p = unicode->str;
917 e = s + size;
918
919 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000920 Py_UNICODE ch;
921 restart:
922 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000923
924 if (inShift) {
925 if ((ch == '-') || !B64CHAR(ch)) {
926 inShift = 0;
927 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000928
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000929 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
930 if (bitsleft >= 6) {
931 /* The shift sequence has a partial character in it. If
932 bitsleft < 6 then we could just classify it as padding
933 but that is not the case here */
934
935 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000936 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000937 }
938 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000939 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000940 here so indicate the potential of a misencoded character. */
941
942 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
943 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
944 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000945 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946 }
947
948 if (ch == '-') {
949 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000950 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000951 inShift = 1;
952 }
953 } else if (SPECIAL(ch,0,0)) {
954 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000955 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000956 } else {
957 *p++ = ch;
958 }
959 } else {
960 charsleft = (charsleft << 6) | UB64(ch);
961 bitsleft += 6;
962 s++;
963 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
964 }
965 }
966 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000967 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000968 s++;
969 if (s < e && *s == '-') {
970 s++;
971 *p++ = '+';
972 } else
973 {
974 inShift = 1;
975 bitsleft = 0;
976 }
977 }
978 else if (SPECIAL(ch,0,0)) {
979 errmsg = "unexpected special character";
980 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000981 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000982 }
983 else {
984 *p++ = ch;
985 s++;
986 }
987 continue;
988 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000989 outpos = p-PyUnicode_AS_UNICODE(unicode);
990 endinpos = s-starts;
991 if (unicode_decode_call_errorhandler(
992 errors, &errorHandler,
993 "utf7", errmsg,
994 starts, size, &startinpos, &endinpos, &exc, &s,
995 (PyObject **)&unicode, &outpos, &p))
996 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001000 outpos = p-PyUnicode_AS_UNICODE(unicode);
1001 endinpos = size;
1002 if (unicode_decode_call_errorhandler(
1003 errors, &errorHandler,
1004 "utf7", "unterminated shift sequence",
1005 starts, size, &startinpos, &endinpos, &exc, &s,
1006 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001008 if (s < e)
1009 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001010 }
1011
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001012 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001013 goto onError;
1014
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001015 Py_XDECREF(errorHandler);
1016 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001017 return (PyObject *)unicode;
1018
1019onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001020 Py_XDECREF(errorHandler);
1021 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001022 Py_DECREF(unicode);
1023 return NULL;
1024}
1025
1026
1027PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001028 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001029 int encodeSetO,
1030 int encodeWhiteSpace,
1031 const char *errors)
1032{
1033 PyObject *v;
1034 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001035 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001036 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001037 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 unsigned int bitsleft = 0;
1039 unsigned long charsleft = 0;
1040 char * out;
1041 char * start;
1042
1043 if (size == 0)
1044 return PyString_FromStringAndSize(NULL, 0);
1045
1046 v = PyString_FromStringAndSize(NULL, cbAllocated);
1047 if (v == NULL)
1048 return NULL;
1049
1050 start = out = PyString_AS_STRING(v);
1051 for (;i < size; ++i) {
1052 Py_UNICODE ch = s[i];
1053
1054 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001055 if (ch == '+') {
1056 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001057 *out++ = '-';
1058 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1059 charsleft = ch;
1060 bitsleft = 16;
1061 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001062 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001064 } else {
1065 *out++ = (char) ch;
1066 }
1067 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1069 *out++ = B64(charsleft << (6-bitsleft));
1070 charsleft = 0;
1071 bitsleft = 0;
1072 /* Characters not in the BASE64 set implicitly unshift the sequence
1073 so no '-' is required, except if the character is itself a '-' */
1074 if (B64CHAR(ch) || ch == '-') {
1075 *out++ = '-';
1076 }
1077 inShift = 0;
1078 *out++ = (char) ch;
1079 } else {
1080 bitsleft += 16;
1081 charsleft = (charsleft << 16) | ch;
1082 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1083
1084 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001085 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001086 or '-' then the shift sequence will be terminated implicitly and we
1087 don't have to insert a '-'. */
1088
1089 if (bitsleft == 0) {
1090 if (i + 1 < size) {
1091 Py_UNICODE ch2 = s[i+1];
1092
1093 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001094
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001095 } else if (B64CHAR(ch2) || ch2 == '-') {
1096 *out++ = '-';
1097 inShift = 0;
1098 } else {
1099 inShift = 0;
1100 }
1101
1102 }
1103 else {
1104 *out++ = '-';
1105 inShift = 0;
1106 }
1107 }
Tim Petersced69f82003-09-16 20:30:58 +00001108 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001110 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001111 if (bitsleft) {
1112 *out++= B64(charsleft << (6-bitsleft) );
1113 *out++ = '-';
1114 }
1115
Tim Peters5de98422002-04-27 18:44:32 +00001116 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001117 return v;
1118}
1119
1120#undef SPECIAL
1121#undef B64
1122#undef B64CHAR
1123#undef UB64
1124#undef ENCODE
1125#undef DECODE
1126
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127/* --- UTF-8 Codec -------------------------------------------------------- */
1128
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means illegal
   prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x0F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 - 0xAF */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 - 0xBF */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xCF */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 - 0xDF */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1150
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001152 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 const char *errors)
1154{
Walter Dörwald69652032004-09-07 20:24:22 +00001155 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1156}
1157
1158PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001159 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001160 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001161 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001162{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001163 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001165 Py_ssize_t startinpos;
1166 Py_ssize_t endinpos;
1167 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 const char *e;
1169 PyUnicodeObject *unicode;
1170 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001172 PyObject *errorHandler = NULL;
1173 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
1175 /* Note: size will always be longer than the resulting Unicode
1176 character count */
1177 unicode = _PyUnicode_New(size);
1178 if (!unicode)
1179 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001180 if (size == 0) {
1181 if (consumed)
1182 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185
1186 /* Unpack UTF-8 encoded data */
1187 p = unicode->str;
1188 e = s + size;
1189
1190 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001191 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192
1193 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001194 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 s++;
1196 continue;
1197 }
1198
1199 n = utf8_code_length[ch];
1200
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001201 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001202 if (consumed)
1203 break;
1204 else {
1205 errmsg = "unexpected end of data";
1206 startinpos = s-starts;
1207 endinpos = size;
1208 goto utf8Error;
1209 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211
1212 switch (n) {
1213
1214 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001216 startinpos = s-starts;
1217 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001218 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219
1220 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001221 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001222 startinpos = s-starts;
1223 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 if ((s[1] & 0xc0) != 0x80) {
1228 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001229 startinpos = s-starts;
1230 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001231 goto utf8Error;
1232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001234 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001235 startinpos = s-starts;
1236 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001237 errmsg = "illegal encoding";
1238 goto utf8Error;
1239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001241 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 break;
1243
1244 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001245 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001246 (s[2] & 0xc0) != 0x80) {
1247 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248 startinpos = s-starts;
1249 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001250 goto utf8Error;
1251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001253 if (ch < 0x0800) {
1254 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001255 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001256
1257 XXX For wide builds (UCS-4) we should probably try
1258 to recombine the surrogates into a single code
1259 unit.
1260 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262 startinpos = s-starts;
1263 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001264 goto utf8Error;
1265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001266 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001267 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001268 break;
1269
1270 case 4:
1271 if ((s[1] & 0xc0) != 0x80 ||
1272 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001273 (s[3] & 0xc0) != 0x80) {
1274 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001275 startinpos = s-starts;
1276 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 goto utf8Error;
1278 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001279 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1280 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1281 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001282 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001283 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001284 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001285 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001286 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001288 startinpos = s-starts;
1289 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001290 goto utf8Error;
1291 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001292#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001293 *p++ = (Py_UNICODE)ch;
1294#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001295 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001296
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001297 /* translate from 10000..10FFFF to 0..FFFF */
1298 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001299
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001300 /* high surrogate = top 10 bits added to D800 */
1301 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001302
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001303 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001304 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001305#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306 break;
1307
1308 default:
1309 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001310 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001311 startinpos = s-starts;
1312 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001313 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 }
1315 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001317
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001318 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001319 outpos = p-PyUnicode_AS_UNICODE(unicode);
1320 if (unicode_decode_call_errorhandler(
1321 errors, &errorHandler,
1322 "utf8", errmsg,
1323 starts, size, &startinpos, &endinpos, &exc, &s,
1324 (PyObject **)&unicode, &outpos, &p))
1325 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326 }
Walter Dörwald69652032004-09-07 20:24:22 +00001327 if (consumed)
1328 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329
1330 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001331 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332 goto onError;
1333
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001334 Py_XDECREF(errorHandler);
1335 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001336 return (PyObject *)unicode;
1337
1338onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 Py_XDECREF(errorHandler);
1340 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 Py_DECREF(unicode);
1342 return NULL;
1343}
1344
Tim Peters602f7402002-04-27 18:03:26 +00001345/* Allocation strategy: if the string is short, convert into a stack buffer
1346 and allocate exactly as much space needed at the end. Else allocate the
1347 maximum possible needed (4 result bytes per Unicode character), and return
1348 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001349*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001350PyObject *
1351PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001352 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001353 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354{
Tim Peters602f7402002-04-27 18:03:26 +00001355#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001356
Martin v. Löwis18e16552006-02-15 17:27:45 +00001357 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001358 PyObject *v; /* result string object */
1359 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001360 Py_ssize_t nallocated; /* number of result bytes allocated */
Tim Peters602f7402002-04-27 18:03:26 +00001361 int nneeded; /* number of result bytes needed */
1362 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001363
Tim Peters602f7402002-04-27 18:03:26 +00001364 assert(s != NULL);
1365 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366
Tim Peters602f7402002-04-27 18:03:26 +00001367 if (size <= MAX_SHORT_UNICHARS) {
1368 /* Write into the stack buffer; nallocated can't overflow.
1369 * At the end, we'll allocate exactly as much heap space as it
1370 * turns out we need.
1371 */
1372 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1373 v = NULL; /* will allocate after we're done */
1374 p = stackbuf;
1375 }
1376 else {
1377 /* Overallocate on the heap, and give the excess back at the end. */
1378 nallocated = size * 4;
1379 if (nallocated / 4 != size) /* overflow! */
1380 return PyErr_NoMemory();
1381 v = PyString_FromStringAndSize(NULL, nallocated);
1382 if (v == NULL)
1383 return NULL;
1384 p = PyString_AS_STRING(v);
1385 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001386
Tim Peters602f7402002-04-27 18:03:26 +00001387 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001388 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001389
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001390 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001391 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001393
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001395 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001396 *p++ = (char)(0xc0 | (ch >> 6));
1397 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001398 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001399 else {
Tim Peters602f7402002-04-27 18:03:26 +00001400 /* Encode UCS2 Unicode ordinals */
1401 if (ch < 0x10000) {
1402 /* Special case: check for high surrogate */
1403 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1404 Py_UCS4 ch2 = s[i];
1405 /* Check for low surrogate and combine the two to
1406 form a UCS4 value */
1407 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001408 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001409 i++;
1410 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001411 }
Tim Peters602f7402002-04-27 18:03:26 +00001412 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001413 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001414 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001415 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1416 *p++ = (char)(0x80 | (ch & 0x3f));
1417 continue;
1418 }
1419encodeUCS4:
1420 /* Encode UCS4 Unicode ordinals */
1421 *p++ = (char)(0xf0 | (ch >> 18));
1422 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1423 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1424 *p++ = (char)(0x80 | (ch & 0x3f));
1425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001427
Tim Peters602f7402002-04-27 18:03:26 +00001428 if (v == NULL) {
1429 /* This was stack allocated. */
1430 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1431 assert(nneeded <= nallocated);
1432 v = PyString_FromStringAndSize(stackbuf, nneeded);
1433 }
1434 else {
1435 /* Cut back to size actually needed. */
1436 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1437 assert(nneeded <= nallocated);
1438 _PyString_Resize(&v, nneeded);
1439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441
Tim Peters602f7402002-04-27 18:03:26 +00001442#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443}
1444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1446{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 if (!PyUnicode_Check(unicode)) {
1448 PyErr_BadArgument();
1449 return NULL;
1450 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001451 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1452 PyUnicode_GET_SIZE(unicode),
1453 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454}
1455
1456/* --- UTF-16 Codec ------------------------------------------------------- */
1457
Tim Peters772747b2001-08-09 22:21:55 +00001458PyObject *
1459PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001460 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001461 const char *errors,
1462 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463{
Walter Dörwald69652032004-09-07 20:24:22 +00001464 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1465}
1466
1467PyObject *
1468PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001469 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001470 const char *errors,
1471 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001472 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001473{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001474 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001475 Py_ssize_t startinpos;
1476 Py_ssize_t endinpos;
1477 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478 PyUnicodeObject *unicode;
1479 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001480 const unsigned char *q, *e;
1481 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001482 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001483 /* Offsets from q for retrieving byte pairs in the right order. */
1484#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1485 int ihi = 1, ilo = 0;
1486#else
1487 int ihi = 0, ilo = 1;
1488#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001489 PyObject *errorHandler = NULL;
1490 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491
1492 /* Note: size will always be longer than the resulting Unicode
1493 character count */
1494 unicode = _PyUnicode_New(size);
1495 if (!unicode)
1496 return NULL;
1497 if (size == 0)
1498 return (PyObject *)unicode;
1499
1500 /* Unpack UTF-16 encoded data */
1501 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001502 q = (unsigned char *)s;
1503 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504
1505 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001506 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001508 /* Check for BOM marks (U+FEFF) in the input and adjust current
1509 byte order setting accordingly. In native mode, the leading BOM
1510 mark is skipped, in all other modes, it is copied to the output
1511 stream as-is (giving a ZWNBSP character). */
1512 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001513 if (size >= 2) {
1514 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001515#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001516 if (bom == 0xFEFF) {
1517 q += 2;
1518 bo = -1;
1519 }
1520 else if (bom == 0xFFFE) {
1521 q += 2;
1522 bo = 1;
1523 }
Tim Petersced69f82003-09-16 20:30:58 +00001524#else
Walter Dörwald69652032004-09-07 20:24:22 +00001525 if (bom == 0xFEFF) {
1526 q += 2;
1527 bo = 1;
1528 }
1529 else if (bom == 0xFFFE) {
1530 q += 2;
1531 bo = -1;
1532 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001533#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001534 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536
Tim Peters772747b2001-08-09 22:21:55 +00001537 if (bo == -1) {
1538 /* force LE */
1539 ihi = 1;
1540 ilo = 0;
1541 }
1542 else if (bo == 1) {
1543 /* force BE */
1544 ihi = 0;
1545 ilo = 1;
1546 }
1547
1548 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001550 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001552 if (consumed)
1553 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001554 errmsg = "truncated data";
1555 startinpos = ((const char *)q)-starts;
1556 endinpos = ((const char *)e)-starts;
1557 goto utf16Error;
1558 /* The remaining input chars are ignored if the callback
1559 chooses to skip the input */
1560 }
1561 ch = (q[ihi] << 8) | q[ilo];
1562
Tim Peters772747b2001-08-09 22:21:55 +00001563 q += 2;
1564
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565 if (ch < 0xD800 || ch > 0xDFFF) {
1566 *p++ = ch;
1567 continue;
1568 }
1569
1570 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001571 if (q >= e) {
1572 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001573 startinpos = (((const char *)q)-2)-starts;
1574 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001575 goto utf16Error;
1576 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001577 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001578 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1579 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001580 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001581#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001582 *p++ = ch;
1583 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001584#else
1585 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001586#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001587 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001588 }
1589 else {
1590 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001591 startinpos = (((const char *)q)-4)-starts;
1592 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001593 goto utf16Error;
1594 }
1595
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001597 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001598 startinpos = (((const char *)q)-2)-starts;
1599 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001600 /* Fall through to report the error */
1601
1602 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001603 outpos = p-PyUnicode_AS_UNICODE(unicode);
1604 if (unicode_decode_call_errorhandler(
1605 errors, &errorHandler,
1606 "utf16", errmsg,
1607 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1608 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001609 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 }
1611
1612 if (byteorder)
1613 *byteorder = bo;
1614
Walter Dörwald69652032004-09-07 20:24:22 +00001615 if (consumed)
1616 *consumed = (const char *)q-starts;
1617
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001619 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 goto onError;
1621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001622 Py_XDECREF(errorHandler);
1623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624 return (PyObject *)unicode;
1625
1626onError:
1627 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 Py_XDECREF(errorHandler);
1629 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630 return NULL;
1631}
1632
Tim Peters772747b2001-08-09 22:21:55 +00001633PyObject *
1634PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001635 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001636 const char *errors,
1637 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638{
1639 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001640 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001641#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001642 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001643#else
1644 const int pairs = 0;
1645#endif
Tim Peters772747b2001-08-09 22:21:55 +00001646 /* Offsets from p for storing byte pairs in the right order. */
1647#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1648 int ihi = 1, ilo = 0;
1649#else
1650 int ihi = 0, ilo = 1;
1651#endif
1652
1653#define STORECHAR(CH) \
1654 do { \
1655 p[ihi] = ((CH) >> 8) & 0xff; \
1656 p[ilo] = (CH) & 0xff; \
1657 p += 2; \
1658 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001660#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001661 for (i = pairs = 0; i < size; i++)
1662 if (s[i] >= 0x10000)
1663 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001664#endif
Tim Petersced69f82003-09-16 20:30:58 +00001665 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001666 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667 if (v == NULL)
1668 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669
Tim Peters772747b2001-08-09 22:21:55 +00001670 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001672 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001673 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001674 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001675
1676 if (byteorder == -1) {
1677 /* force LE */
1678 ihi = 1;
1679 ilo = 0;
1680 }
1681 else if (byteorder == 1) {
1682 /* force BE */
1683 ihi = 0;
1684 ilo = 1;
1685 }
1686
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001687 while (size-- > 0) {
1688 Py_UNICODE ch = *s++;
1689 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001690#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001691 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001692 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1693 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001695#endif
Tim Peters772747b2001-08-09 22:21:55 +00001696 STORECHAR(ch);
1697 if (ch2)
1698 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001699 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001701#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702}
1703
1704PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1705{
1706 if (!PyUnicode_Check(unicode)) {
1707 PyErr_BadArgument();
1708 return NULL;
1709 }
1710 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1711 PyUnicode_GET_SIZE(unicode),
1712 NULL,
1713 0);
1714}
1715
1716/* --- Unicode Escape Codec ----------------------------------------------- */
1717
Fredrik Lundh06d12682001-01-24 07:59:11 +00001718static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001719
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001721 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 const char *errors)
1723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001724 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001725 Py_ssize_t startinpos;
1726 Py_ssize_t endinpos;
1727 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001732 char* message;
1733 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001734 PyObject *errorHandler = NULL;
1735 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001736
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 /* Escaped strings will always be longer than the resulting
1738 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 length after conversion to the true value.
1740 (but if the error callback returns a long replacement string
1741 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 v = _PyUnicode_New(size);
1743 if (v == NULL)
1744 goto onError;
1745 if (size == 0)
1746 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001750
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 while (s < end) {
1752 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001753 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001754 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755
1756 /* Non-escape characters are interpreted as Unicode ordinals */
1757 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001758 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 continue;
1760 }
1761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001762 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 /* \ - Escapes */
1764 s++;
1765 switch (*s++) {
1766
1767 /* \x escapes */
1768 case '\n': break;
1769 case '\\': *p++ = '\\'; break;
1770 case '\'': *p++ = '\''; break;
1771 case '\"': *p++ = '\"'; break;
1772 case 'b': *p++ = '\b'; break;
1773 case 'f': *p++ = '\014'; break; /* FF */
1774 case 't': *p++ = '\t'; break;
1775 case 'n': *p++ = '\n'; break;
1776 case 'r': *p++ = '\r'; break;
1777 case 'v': *p++ = '\013'; break; /* VT */
1778 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1779
1780 /* \OOO (octal) escapes */
1781 case '0': case '1': case '2': case '3':
1782 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001783 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001785 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001787 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001789 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 break;
1791
Fredrik Lundhccc74732001-02-18 22:13:49 +00001792 /* hex escapes */
1793 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001795 digits = 2;
1796 message = "truncated \\xXX escape";
1797 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798
Fredrik Lundhccc74732001-02-18 22:13:49 +00001799 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801 digits = 4;
1802 message = "truncated \\uXXXX escape";
1803 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804
Fredrik Lundhccc74732001-02-18 22:13:49 +00001805 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001806 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001807 digits = 8;
1808 message = "truncated \\UXXXXXXXX escape";
1809 hexescape:
1810 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001811 outpos = p-PyUnicode_AS_UNICODE(v);
1812 if (s+digits>end) {
1813 endinpos = size;
1814 if (unicode_decode_call_errorhandler(
1815 errors, &errorHandler,
1816 "unicodeescape", "end of string in escape sequence",
1817 starts, size, &startinpos, &endinpos, &exc, &s,
1818 (PyObject **)&v, &outpos, &p))
1819 goto onError;
1820 goto nextByte;
1821 }
1822 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001823 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001824 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001825 endinpos = (s+i+1)-starts;
1826 if (unicode_decode_call_errorhandler(
1827 errors, &errorHandler,
1828 "unicodeescape", message,
1829 starts, size, &startinpos, &endinpos, &exc, &s,
1830 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001831 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001832 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001833 }
1834 chr = (chr<<4) & ~0xF;
1835 if (c >= '0' && c <= '9')
1836 chr += c - '0';
1837 else if (c >= 'a' && c <= 'f')
1838 chr += 10 + c - 'a';
1839 else
1840 chr += 10 + c - 'A';
1841 }
1842 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001843 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 /* _decoding_error will have already written into the
1845 target buffer. */
1846 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001847 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001848 /* when we get here, chr is a 32-bit unicode character */
1849 if (chr <= 0xffff)
1850 /* UCS-2 character */
1851 *p++ = (Py_UNICODE) chr;
1852 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001853 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001854 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001855#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001856 *p++ = chr;
1857#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001858 chr -= 0x10000L;
1859 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001860 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001861#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001862 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 endinpos = s-starts;
1864 outpos = p-PyUnicode_AS_UNICODE(v);
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "illegal Unicode character",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001870 goto onError;
1871 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001872 break;
1873
1874 /* \N{name} */
1875 case 'N':
1876 message = "malformed \\N character escape";
1877 if (ucnhash_CAPI == NULL) {
1878 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001879 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001880 m = PyImport_ImportModule("unicodedata");
1881 if (m == NULL)
1882 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001883 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001884 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001885 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001886 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001887 ucnhash_CAPI = PyCObject_AsVoidPtr(api);
1888 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001889 if (ucnhash_CAPI == NULL)
1890 goto ucnhashError;
1891 }
1892 if (*s == '{') {
1893 const char *start = s+1;
1894 /* look for the closing brace */
1895 while (*s != '}' && s < end)
1896 s++;
1897 if (s > start && s < end && *s == '}') {
1898 /* found a name. look it up in the unicode database */
1899 message = "unknown Unicode character name";
1900 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001901 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001902 goto store;
1903 }
1904 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001905 endinpos = s-starts;
1906 outpos = p-PyUnicode_AS_UNICODE(v);
1907 if (unicode_decode_call_errorhandler(
1908 errors, &errorHandler,
1909 "unicodeescape", message,
1910 starts, size, &startinpos, &endinpos, &exc, &s,
1911 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001912 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001913 break;
1914
1915 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001916 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001917 message = "\\ at end of string";
1918 s--;
1919 endinpos = s-starts;
1920 outpos = p-PyUnicode_AS_UNICODE(v);
1921 if (unicode_decode_call_errorhandler(
1922 errors, &errorHandler,
1923 "unicodeescape", message,
1924 starts, size, &startinpos, &endinpos, &exc, &s,
1925 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001926 goto onError;
1927 }
1928 else {
1929 *p++ = '\\';
1930 *p++ = (unsigned char)s[-1];
1931 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001932 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001934 nextByte:
1935 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001937 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001939 Py_XDECREF(errorHandler);
1940 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001942
Fredrik Lundhccc74732001-02-18 22:13:49 +00001943ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001944 PyErr_SetString(
1945 PyExc_UnicodeError,
1946 "\\N escapes not supported (can't load unicodedata module)"
1947 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001948 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001949 Py_XDECREF(errorHandler);
1950 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001951 return NULL;
1952
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001955 Py_XDECREF(errorHandler);
1956 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 return NULL;
1958}
1959
1960/* Return a Unicode-Escape string version of the Unicode object.
1961
1962 If quotes is true, the string is enclosed in u"" or u'' quotes as
1963 appropriate.
1964
1965*/
1966
Barry Warsaw51ac5802000-03-20 16:36:48 +00001967static const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001968 Py_ssize_t size,
Barry Warsaw51ac5802000-03-20 16:36:48 +00001969 Py_UNICODE ch);
1970
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971static
1972PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001973 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 int quotes)
1975{
1976 PyObject *repr;
1977 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001979 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980
1981 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1982 if (repr == NULL)
1983 return NULL;
1984
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001985 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986
1987 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001989 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 !findchar(s, size, '"')) ? '"' : '\'';
1991 }
1992 while (size-- > 0) {
1993 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994
Hye-Shik Chang835b2432005-12-17 04:38:31 +00001995 /* Escape quotes and backslashes */
1996 if ((quotes &&
1997 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998 *p++ = '\\';
1999 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002000 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002001 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002002
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002003#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002004 /* Map 21-bit characters to '\U00xxxxxx' */
2005 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002006 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002007
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002008 /* Resize the string if necessary */
2009 if (offset + 12 > PyString_GET_SIZE(repr)) {
2010 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002011 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002012 p = PyString_AS_STRING(repr) + offset;
2013 }
2014
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002015 *p++ = '\\';
2016 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002017 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2021 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2022 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2023 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002024 *p++ = hexdigit[ch & 0x0000000F];
2025 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002026 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002027#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002028 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2029 else if (ch >= 0xD800 && ch < 0xDC00) {
2030 Py_UNICODE ch2;
2031 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002032
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002033 ch2 = *s++;
2034 size--;
2035 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2036 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2037 *p++ = '\\';
2038 *p++ = 'U';
2039 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2043 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2044 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2045 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2046 *p++ = hexdigit[ucs & 0x0000000F];
2047 continue;
2048 }
2049 /* Fall through: isolated surrogates are copied as-is */
2050 s--;
2051 size++;
2052 }
2053
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002055 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 *p++ = '\\';
2057 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002058 *p++ = hexdigit[(ch >> 12) & 0x000F];
2059 *p++ = hexdigit[(ch >> 8) & 0x000F];
2060 *p++ = hexdigit[(ch >> 4) & 0x000F];
2061 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002063
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002064 /* Map special whitespace to '\t', \n', '\r' */
2065 else if (ch == '\t') {
2066 *p++ = '\\';
2067 *p++ = 't';
2068 }
2069 else if (ch == '\n') {
2070 *p++ = '\\';
2071 *p++ = 'n';
2072 }
2073 else if (ch == '\r') {
2074 *p++ = '\\';
2075 *p++ = 'r';
2076 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002077
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002078 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002079 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002081 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002082 *p++ = hexdigit[(ch >> 4) & 0x000F];
2083 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002084 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002085
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 /* Copy everything else as-is */
2087 else
2088 *p++ = (char) ch;
2089 }
2090 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002091 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092
2093 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002094 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095 return repr;
2096}
2097
2098PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002099 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100{
2101 return unicodeescape_string(s, size, 0);
2102}
2103
2104PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2105{
2106 if (!PyUnicode_Check(unicode)) {
2107 PyErr_BadArgument();
2108 return NULL;
2109 }
2110 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2111 PyUnicode_GET_SIZE(unicode));
2112}
2113
2114/* --- Raw Unicode Escape Codec ------------------------------------------- */
2115
2116PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002117 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 const char *errors)
2119{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002120 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002121 Py_ssize_t startinpos;
2122 Py_ssize_t endinpos;
2123 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 const char *end;
2127 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002128 PyObject *errorHandler = NULL;
2129 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002130
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 /* Escaped strings will always be longer than the resulting
2132 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002133 length after conversion to the true value. (But decoding error
2134 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 v = _PyUnicode_New(size);
2136 if (v == NULL)
2137 goto onError;
2138 if (size == 0)
2139 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002140 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 end = s + size;
2142 while (s < end) {
2143 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002144 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002146 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147
2148 /* Non-escape characters are interpreted as Unicode ordinals */
2149 if (*s != '\\') {
2150 *p++ = (unsigned char)*s++;
2151 continue;
2152 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002153 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154
2155 /* \u-escapes are only interpreted iff the number of leading
2156 backslashes if odd */
2157 bs = s;
2158 for (;s < end;) {
2159 if (*s != '\\')
2160 break;
2161 *p++ = (unsigned char)*s++;
2162 }
2163 if (((s - bs) & 1) == 0 ||
2164 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002165 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166 continue;
2167 }
2168 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002169 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 s++;
2171
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002172 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002173 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002174 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002175 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002177 endinpos = s-starts;
2178 if (unicode_decode_call_errorhandler(
2179 errors, &errorHandler,
2180 "rawunicodeescape", "truncated \\uXXXX",
2181 starts, size, &startinpos, &endinpos, &exc, &s,
2182 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002184 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002185 }
2186 x = (x<<4) & ~0xF;
2187 if (c >= '0' && c <= '9')
2188 x += c - '0';
2189 else if (c >= 'a' && c <= 'f')
2190 x += 10 + c - 'a';
2191 else
2192 x += 10 + c - 'A';
2193 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002194#ifndef Py_UNICODE_WIDE
2195 if (x > 0x10000) {
2196 if (unicode_decode_call_errorhandler(
2197 errors, &errorHandler,
2198 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2199 starts, size, &startinpos, &endinpos, &exc, &s,
2200 (PyObject **)&v, &outpos, &p))
2201 goto onError;
2202 }
2203#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002204 *p++ = x;
2205 nextByte:
2206 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002208 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002209 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002210 Py_XDECREF(errorHandler);
2211 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002213
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 onError:
2215 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002216 Py_XDECREF(errorHandler);
2217 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002218 return NULL;
2219}
2220
2221PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002222 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223{
2224 PyObject *repr;
2225 char *p;
2226 char *q;
2227
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002228 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002230#ifdef Py_UNICODE_WIDE
2231 repr = PyString_FromStringAndSize(NULL, 10 * size);
2232#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002234#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235 if (repr == NULL)
2236 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002237 if (size == 0)
2238 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239
2240 p = q = PyString_AS_STRING(repr);
2241 while (size-- > 0) {
2242 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002243#ifdef Py_UNICODE_WIDE
2244 /* Map 32-bit characters to '\Uxxxxxxxx' */
2245 if (ch >= 0x10000) {
2246 *p++ = '\\';
2247 *p++ = 'U';
2248 *p++ = hexdigit[(ch >> 28) & 0xf];
2249 *p++ = hexdigit[(ch >> 24) & 0xf];
2250 *p++ = hexdigit[(ch >> 20) & 0xf];
2251 *p++ = hexdigit[(ch >> 16) & 0xf];
2252 *p++ = hexdigit[(ch >> 12) & 0xf];
2253 *p++ = hexdigit[(ch >> 8) & 0xf];
2254 *p++ = hexdigit[(ch >> 4) & 0xf];
2255 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002256 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002257 else
2258#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259 /* Map 16-bit characters to '\uxxxx' */
2260 if (ch >= 256) {
2261 *p++ = '\\';
2262 *p++ = 'u';
2263 *p++ = hexdigit[(ch >> 12) & 0xf];
2264 *p++ = hexdigit[(ch >> 8) & 0xf];
2265 *p++ = hexdigit[(ch >> 4) & 0xf];
2266 *p++ = hexdigit[ch & 15];
2267 }
2268 /* Copy everything else as-is */
2269 else
2270 *p++ = (char) ch;
2271 }
2272 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002273 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 return repr;
2275}
2276
2277PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2278{
2279 if (!PyUnicode_Check(unicode)) {
2280 PyErr_BadArgument();
2281 return NULL;
2282 }
2283 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2284 PyUnicode_GET_SIZE(unicode));
2285}
2286
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002287/* --- Unicode Internal Codec ------------------------------------------- */
2288
2289PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002290 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002291 const char *errors)
2292{
2293 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002294 Py_ssize_t startinpos;
2295 Py_ssize_t endinpos;
2296 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002297 PyUnicodeObject *v;
2298 Py_UNICODE *p;
2299 const char *end;
2300 const char *reason;
2301 PyObject *errorHandler = NULL;
2302 PyObject *exc = NULL;
2303
Neal Norwitzd43069c2006-01-08 01:12:10 +00002304#ifdef Py_UNICODE_WIDE
2305 Py_UNICODE unimax = PyUnicode_GetMax();
2306#endif
2307
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002308 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2309 if (v == NULL)
2310 goto onError;
2311 if (PyUnicode_GetSize((PyObject *)v) == 0)
2312 return (PyObject *)v;
2313 p = PyUnicode_AS_UNICODE(v);
2314 end = s + size;
2315
2316 while (s < end) {
2317 *p = *(Py_UNICODE *)s;
2318 /* We have to sanity check the raw data, otherwise doom looms for
2319 some malformed UCS-4 data. */
2320 if (
2321 #ifdef Py_UNICODE_WIDE
2322 *p > unimax || *p < 0 ||
2323 #endif
2324 end-s < Py_UNICODE_SIZE
2325 )
2326 {
2327 startinpos = s - starts;
2328 if (end-s < Py_UNICODE_SIZE) {
2329 endinpos = end-starts;
2330 reason = "truncated input";
2331 }
2332 else {
2333 endinpos = s - starts + Py_UNICODE_SIZE;
2334 reason = "illegal code point (> 0x10FFFF)";
2335 }
2336 outpos = p - PyUnicode_AS_UNICODE(v);
2337 if (unicode_decode_call_errorhandler(
2338 errors, &errorHandler,
2339 "unicode_internal", reason,
2340 starts, size, &startinpos, &endinpos, &exc, &s,
2341 (PyObject **)&v, &outpos, &p)) {
2342 goto onError;
2343 }
2344 }
2345 else {
2346 p++;
2347 s += Py_UNICODE_SIZE;
2348 }
2349 }
2350
2351 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2352 goto onError;
2353 Py_XDECREF(errorHandler);
2354 Py_XDECREF(exc);
2355 return (PyObject *)v;
2356
2357 onError:
2358 Py_XDECREF(v);
2359 Py_XDECREF(errorHandler);
2360 Py_XDECREF(exc);
2361 return NULL;
2362}
2363
Guido van Rossumd57fd912000-03-10 22:53:23 +00002364/* --- Latin-1 Codec ------------------------------------------------------ */
2365
2366PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002367 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368 const char *errors)
2369{
2370 PyUnicodeObject *v;
2371 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002372
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002374 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002375 Py_UNICODE r = *(unsigned char*)s;
2376 return PyUnicode_FromUnicode(&r, 1);
2377 }
2378
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379 v = _PyUnicode_New(size);
2380 if (v == NULL)
2381 goto onError;
2382 if (size == 0)
2383 return (PyObject *)v;
2384 p = PyUnicode_AS_UNICODE(v);
2385 while (size-- > 0)
2386 *p++ = (unsigned char)*s++;
2387 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002388
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389 onError:
2390 Py_XDECREF(v);
2391 return NULL;
2392}
2393
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002394/* create or adjust a UnicodeEncodeError */
2395static void make_encode_exception(PyObject **exceptionObject,
2396 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002397 const Py_UNICODE *unicode, Py_ssize_t size,
2398 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002399 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002401 if (*exceptionObject == NULL) {
2402 *exceptionObject = PyUnicodeEncodeError_Create(
2403 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404 }
2405 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002406 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2407 goto onError;
2408 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2409 goto onError;
2410 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2411 goto onError;
2412 return;
2413 onError:
2414 Py_DECREF(*exceptionObject);
2415 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416 }
2417}
2418
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002419/* raises a UnicodeEncodeError */
2420static void raise_encode_exception(PyObject **exceptionObject,
2421 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002422 const Py_UNICODE *unicode, Py_ssize_t size,
2423 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002424 const char *reason)
2425{
2426 make_encode_exception(exceptionObject,
2427 encoding, unicode, size, startpos, endpos, reason);
2428 if (*exceptionObject != NULL)
2429 PyCodec_StrictErrors(*exceptionObject);
2430}
2431
2432/* error handling callback helper:
2433 build arguments, call the callback and check the arguments,
2434 put the result into newpos and return the replacement string, which
2435 has to be freed by the caller */
2436static PyObject *unicode_encode_call_errorhandler(const char *errors,
2437 PyObject **errorHandler,
2438 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002439 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2440 Py_ssize_t startpos, Py_ssize_t endpos,
2441 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002442{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002443 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002444
2445 PyObject *restuple;
2446 PyObject *resunicode;
2447
2448 if (*errorHandler == NULL) {
2449 *errorHandler = PyCodec_LookupError(errors);
2450 if (*errorHandler == NULL)
2451 return NULL;
2452 }
2453
2454 make_encode_exception(exceptionObject,
2455 encoding, unicode, size, startpos, endpos, reason);
2456 if (*exceptionObject == NULL)
2457 return NULL;
2458
2459 restuple = PyObject_CallFunctionObjArgs(
2460 *errorHandler, *exceptionObject, NULL);
2461 if (restuple == NULL)
2462 return NULL;
2463 if (!PyTuple_Check(restuple)) {
2464 PyErr_Format(PyExc_TypeError, &argparse[4]);
2465 Py_DECREF(restuple);
2466 return NULL;
2467 }
2468 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2469 &resunicode, newpos)) {
2470 Py_DECREF(restuple);
2471 return NULL;
2472 }
2473 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002474 *newpos = size+*newpos;
2475 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002476 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002477 Py_DECREF(restuple);
2478 return NULL;
2479 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 Py_INCREF(resunicode);
2481 Py_DECREF(restuple);
2482 return resunicode;
2483}
2484
2485static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002486 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002487 const char *errors,
2488 int limit)
2489{
2490 /* output object */
2491 PyObject *res;
2492 /* pointers to the beginning and end+1 of input */
2493 const Py_UNICODE *startp = p;
2494 const Py_UNICODE *endp = p + size;
2495 /* pointer to the beginning of the unencodable characters */
2496 /* const Py_UNICODE *badp = NULL; */
2497 /* pointer into the output */
2498 char *str;
2499 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002500 Py_ssize_t respos = 0;
2501 Py_ssize_t ressize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002502 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2503 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2504 PyObject *errorHandler = NULL;
2505 PyObject *exc = NULL;
2506 /* the following variable is used for caching string comparisons
2507 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2508 int known_errorHandler = -1;
2509
2510 /* allocate enough for a simple encoding without
2511 replacements, if we need more, we'll resize */
2512 res = PyString_FromStringAndSize(NULL, size);
2513 if (res == NULL)
2514 goto onError;
2515 if (size == 0)
2516 return res;
2517 str = PyString_AS_STRING(res);
2518 ressize = size;
2519
2520 while (p<endp) {
2521 Py_UNICODE c = *p;
2522
2523 /* can we encode this? */
2524 if (c<limit) {
2525 /* no overflow check, because we know that the space is enough */
2526 *str++ = (char)c;
2527 ++p;
2528 }
2529 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002530 Py_ssize_t unicodepos = p-startp;
2531 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002532 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002533 Py_ssize_t repsize;
2534 Py_ssize_t newpos;
2535 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 Py_UNICODE *uni2;
2537 /* startpos for collecting unencodable chars */
2538 const Py_UNICODE *collstart = p;
2539 const Py_UNICODE *collend = p;
2540 /* find all unecodable characters */
2541 while ((collend < endp) && ((*collend)>=limit))
2542 ++collend;
2543 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2544 if (known_errorHandler==-1) {
2545 if ((errors==NULL) || (!strcmp(errors, "strict")))
2546 known_errorHandler = 1;
2547 else if (!strcmp(errors, "replace"))
2548 known_errorHandler = 2;
2549 else if (!strcmp(errors, "ignore"))
2550 known_errorHandler = 3;
2551 else if (!strcmp(errors, "xmlcharrefreplace"))
2552 known_errorHandler = 4;
2553 else
2554 known_errorHandler = 0;
2555 }
2556 switch (known_errorHandler) {
2557 case 1: /* strict */
2558 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2559 goto onError;
2560 case 2: /* replace */
2561 while (collstart++<collend)
2562 *str++ = '?'; /* fall through */
2563 case 3: /* ignore */
2564 p = collend;
2565 break;
2566 case 4: /* xmlcharrefreplace */
2567 respos = str-PyString_AS_STRING(res);
2568 /* determine replacement size (temporarily (mis)uses p) */
2569 for (p = collstart, repsize = 0; p < collend; ++p) {
2570 if (*p<10)
2571 repsize += 2+1+1;
2572 else if (*p<100)
2573 repsize += 2+2+1;
2574 else if (*p<1000)
2575 repsize += 2+3+1;
2576 else if (*p<10000)
2577 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002578#ifndef Py_UNICODE_WIDE
2579 else
2580 repsize += 2+5+1;
2581#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002582 else if (*p<100000)
2583 repsize += 2+5+1;
2584 else if (*p<1000000)
2585 repsize += 2+6+1;
2586 else
2587 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002588#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589 }
2590 requiredsize = respos+repsize+(endp-collend);
2591 if (requiredsize > ressize) {
2592 if (requiredsize<2*ressize)
2593 requiredsize = 2*ressize;
2594 if (_PyString_Resize(&res, requiredsize))
2595 goto onError;
2596 str = PyString_AS_STRING(res) + respos;
2597 ressize = requiredsize;
2598 }
2599 /* generate replacement (temporarily (mis)uses p) */
2600 for (p = collstart; p < collend; ++p) {
2601 str += sprintf(str, "&#%d;", (int)*p);
2602 }
2603 p = collend;
2604 break;
2605 default:
2606 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2607 encoding, reason, startp, size, &exc,
2608 collstart-startp, collend-startp, &newpos);
2609 if (repunicode == NULL)
2610 goto onError;
2611 /* need more space? (at least enough for what we
2612 have+the replacement+the rest of the string, so
2613 we won't have to check space for encodable characters) */
2614 respos = str-PyString_AS_STRING(res);
2615 repsize = PyUnicode_GET_SIZE(repunicode);
2616 requiredsize = respos+repsize+(endp-collend);
2617 if (requiredsize > ressize) {
2618 if (requiredsize<2*ressize)
2619 requiredsize = 2*ressize;
2620 if (_PyString_Resize(&res, requiredsize)) {
2621 Py_DECREF(repunicode);
2622 goto onError;
2623 }
2624 str = PyString_AS_STRING(res) + respos;
2625 ressize = requiredsize;
2626 }
2627 /* check if there is anything unencodable in the replacement
2628 and copy it to the output */
2629 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2630 c = *uni2;
2631 if (c >= limit) {
2632 raise_encode_exception(&exc, encoding, startp, size,
2633 unicodepos, unicodepos+1, reason);
2634 Py_DECREF(repunicode);
2635 goto onError;
2636 }
2637 *str = (char)c;
2638 }
2639 p = startp + newpos;
2640 Py_DECREF(repunicode);
2641 }
2642 }
2643 }
2644 /* Resize if we allocated to much */
2645 respos = str-PyString_AS_STRING(res);
2646 if (respos<ressize)
2647 /* If this falls res will be NULL */
2648 _PyString_Resize(&res, respos);
2649 Py_XDECREF(errorHandler);
2650 Py_XDECREF(exc);
2651 return res;
2652
2653 onError:
2654 Py_XDECREF(res);
2655 Py_XDECREF(errorHandler);
2656 Py_XDECREF(exc);
2657 return NULL;
2658}
2659
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002661 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 const char *errors)
2663{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002664 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665}
2666
2667PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2668{
2669 if (!PyUnicode_Check(unicode)) {
2670 PyErr_BadArgument();
2671 return NULL;
2672 }
2673 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2674 PyUnicode_GET_SIZE(unicode),
2675 NULL);
2676}
2677
2678/* --- 7-bit ASCII Codec -------------------------------------------------- */
2679
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002681 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 const char *errors)
2683{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002684 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 PyUnicodeObject *v;
2686 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002687 Py_ssize_t startinpos;
2688 Py_ssize_t endinpos;
2689 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002690 const char *e;
2691 PyObject *errorHandler = NULL;
2692 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002693
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002695 if (size == 1 && *(unsigned char*)s < 128) {
2696 Py_UNICODE r = *(unsigned char*)s;
2697 return PyUnicode_FromUnicode(&r, 1);
2698 }
Tim Petersced69f82003-09-16 20:30:58 +00002699
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 v = _PyUnicode_New(size);
2701 if (v == NULL)
2702 goto onError;
2703 if (size == 0)
2704 return (PyObject *)v;
2705 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 e = s + size;
2707 while (s < e) {
2708 register unsigned char c = (unsigned char)*s;
2709 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 ++s;
2712 }
2713 else {
2714 startinpos = s-starts;
2715 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002716 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002717 if (unicode_decode_call_errorhandler(
2718 errors, &errorHandler,
2719 "ascii", "ordinal not in range(128)",
2720 starts, size, &startinpos, &endinpos, &exc, &s,
2721 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002725 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002726 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002727 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002731
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732 onError:
2733 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 Py_XDECREF(errorHandler);
2735 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 return NULL;
2737}
2738
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002740 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 const char *errors)
2742{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744}
2745
2746PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2747{
2748 if (!PyUnicode_Check(unicode)) {
2749 PyErr_BadArgument();
2750 return NULL;
2751 }
2752 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2753 PyUnicode_GET_SIZE(unicode),
2754 NULL);
2755}
2756
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002757#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002758
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002759/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002760
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002761PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002762 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002763 const char *errors)
2764{
2765 PyUnicodeObject *v;
2766 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002767 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002768
2769 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002770 assert(size < INT_MAX);
2771 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002772 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002773 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2774
2775 v = _PyUnicode_New(usize);
2776 if (v == NULL)
2777 return NULL;
2778 if (usize == 0)
2779 return (PyObject *)v;
2780 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002781 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002782 Py_DECREF(v);
2783 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2784 }
2785
2786 return (PyObject *)v;
2787}
2788
2789PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002790 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002791 const char *errors)
2792{
2793 PyObject *repr;
2794 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002795 DWORD mbcssize;
2796
2797 /* If there are no characters, bail now! */
2798 if (size==0)
2799 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002800
2801 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002802 assert(size<INT_MAX);
2803 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002804 if (mbcssize==0)
2805 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2806
2807 repr = PyString_FromStringAndSize(NULL, mbcssize);
2808 if (repr == NULL)
2809 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002810 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002811 return repr;
2812
2813 /* Do the conversion */
2814 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002815 assert(size < INT_MAX);
2816 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002817 Py_DECREF(repr);
2818 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2819 }
2820 return repr;
2821}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002822
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002823PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2824{
2825 if (!PyUnicode_Check(unicode)) {
2826 PyErr_BadArgument();
2827 return NULL;
2828 }
2829 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2830 PyUnicode_GET_SIZE(unicode),
2831 NULL);
2832}
2833
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002834#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002835
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836/* --- Character Mapping Codec -------------------------------------------- */
2837
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002839 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 PyObject *mapping,
2841 const char *errors)
2842{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002843 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002844 Py_ssize_t startinpos;
2845 Py_ssize_t endinpos;
2846 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 PyUnicodeObject *v;
2849 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002850 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 PyObject *errorHandler = NULL;
2852 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002853 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002854 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002855
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 /* Default to Latin-1 */
2857 if (mapping == NULL)
2858 return PyUnicode_DecodeLatin1(s, size, errors);
2859
2860 v = _PyUnicode_New(size);
2861 if (v == NULL)
2862 goto onError;
2863 if (size == 0)
2864 return (PyObject *)v;
2865 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002866 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002867 if (PyUnicode_CheckExact(mapping)) {
2868 mapstring = PyUnicode_AS_UNICODE(mapping);
2869 maplen = PyUnicode_GET_SIZE(mapping);
2870 while (s < e) {
2871 unsigned char ch = *s;
2872 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002874 if (ch < maplen)
2875 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002877 if (x == 0xfffe) {
2878 /* undefined mapping */
2879 outpos = p-PyUnicode_AS_UNICODE(v);
2880 startinpos = s-starts;
2881 endinpos = startinpos+1;
2882 if (unicode_decode_call_errorhandler(
2883 errors, &errorHandler,
2884 "charmap", "character maps to <undefined>",
2885 starts, size, &startinpos, &endinpos, &exc, &s,
2886 (PyObject **)&v, &outpos, &p)) {
2887 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002888 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002889 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002890 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002891 *p++ = x;
2892 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002894 }
2895 else {
2896 while (s < e) {
2897 unsigned char ch = *s;
2898 PyObject *w, *x;
2899
2900 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2901 w = PyInt_FromLong((long)ch);
2902 if (w == NULL)
2903 goto onError;
2904 x = PyObject_GetItem(mapping, w);
2905 Py_DECREF(w);
2906 if (x == NULL) {
2907 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2908 /* No mapping found means: mapping is undefined. */
2909 PyErr_Clear();
2910 x = Py_None;
2911 Py_INCREF(x);
2912 } else
2913 goto onError;
2914 }
2915
2916 /* Apply mapping */
2917 if (PyInt_Check(x)) {
2918 long value = PyInt_AS_LONG(x);
2919 if (value < 0 || value > 65535) {
2920 PyErr_SetString(PyExc_TypeError,
2921 "character mapping must be in range(65536)");
2922 Py_DECREF(x);
2923 goto onError;
2924 }
2925 *p++ = (Py_UNICODE)value;
2926 }
2927 else if (x == Py_None) {
2928 /* undefined mapping */
2929 outpos = p-PyUnicode_AS_UNICODE(v);
2930 startinpos = s-starts;
2931 endinpos = startinpos+1;
2932 if (unicode_decode_call_errorhandler(
2933 errors, &errorHandler,
2934 "charmap", "character maps to <undefined>",
2935 starts, size, &startinpos, &endinpos, &exc, &s,
2936 (PyObject **)&v, &outpos, &p)) {
2937 Py_DECREF(x);
2938 goto onError;
2939 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002940 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002941 continue;
2942 }
2943 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002944 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002945
2946 if (targetsize == 1)
2947 /* 1-1 mapping */
2948 *p++ = *PyUnicode_AS_UNICODE(x);
2949
2950 else if (targetsize > 1) {
2951 /* 1-n mapping */
2952 if (targetsize > extrachars) {
2953 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002954 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
2955 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002956 (targetsize << 2);
2957 extrachars += needed;
2958 if (_PyUnicode_Resize(&v,
2959 PyUnicode_GET_SIZE(v) + needed) < 0) {
2960 Py_DECREF(x);
2961 goto onError;
2962 }
2963 p = PyUnicode_AS_UNICODE(v) + oldpos;
2964 }
2965 Py_UNICODE_COPY(p,
2966 PyUnicode_AS_UNICODE(x),
2967 targetsize);
2968 p += targetsize;
2969 extrachars -= targetsize;
2970 }
2971 /* 1-0 mapping: skip the character */
2972 }
2973 else {
2974 /* wrong return value */
2975 PyErr_SetString(PyExc_TypeError,
2976 "character mapping must return integer, None or unicode");
2977 Py_DECREF(x);
2978 goto onError;
2979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002981 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 }
2984 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002985 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002987 Py_XDECREF(errorHandler);
2988 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002990
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002992 Py_XDECREF(errorHandler);
2993 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 Py_XDECREF(v);
2995 return NULL;
2996}
2997
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002998/* Lookup the character ch in the mapping. If the character
2999 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003000 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003001static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003003 PyObject *w = PyInt_FromLong((long)c);
3004 PyObject *x;
3005
3006 if (w == NULL)
3007 return NULL;
3008 x = PyObject_GetItem(mapping, w);
3009 Py_DECREF(w);
3010 if (x == NULL) {
3011 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3012 /* No mapping found means: mapping is undefined. */
3013 PyErr_Clear();
3014 x = Py_None;
3015 Py_INCREF(x);
3016 return x;
3017 } else
3018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003020 else if (x == Py_None)
3021 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 else if (PyInt_Check(x)) {
3023 long value = PyInt_AS_LONG(x);
3024 if (value < 0 || value > 255) {
3025 PyErr_SetString(PyExc_TypeError,
3026 "character mapping must be in range(256)");
3027 Py_DECREF(x);
3028 return NULL;
3029 }
3030 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003032 else if (PyString_Check(x))
3033 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003035 /* wrong return value */
3036 PyErr_SetString(PyExc_TypeError,
3037 "character mapping must return integer, None or str");
3038 Py_DECREF(x);
3039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 }
3041}
3042
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043/* lookup the character, put the result in the output string and adjust
3044 various state variables. Reallocate the output string if not enough
3045 space is available. Return a new reference to the object that
3046 was put in the output buffer, or Py_None, if the mapping was undefined
3047 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003048 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003049static
3050PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003051 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003052{
3053 PyObject *rep = charmapencode_lookup(c, mapping);
3054
3055 if (rep==NULL)
3056 return NULL;
3057 else if (rep==Py_None)
3058 return rep;
3059 else {
3060 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003061 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003063 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 if (outsize<requiredsize) {
3065 /* exponentially overallocate to minimize reallocations */
3066 if (requiredsize < 2*outsize)
3067 requiredsize = 2*outsize;
3068 if (_PyString_Resize(outobj, requiredsize)) {
3069 Py_DECREF(rep);
3070 return NULL;
3071 }
3072 outstart = PyString_AS_STRING(*outobj);
3073 }
3074 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3075 }
3076 else {
3077 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003078 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3079 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 if (outsize<requiredsize) {
3081 /* exponentially overallocate to minimize reallocations */
3082 if (requiredsize < 2*outsize)
3083 requiredsize = 2*outsize;
3084 if (_PyString_Resize(outobj, requiredsize)) {
3085 Py_DECREF(rep);
3086 return NULL;
3087 }
3088 outstart = PyString_AS_STRING(*outobj);
3089 }
3090 memcpy(outstart + *outpos, repchars, repsize);
3091 *outpos += repsize;
3092 }
3093 }
3094 return rep;
3095}
3096
3097/* handle an error in PyUnicode_EncodeCharmap
3098 Return 0 on success, -1 on error */
3099static
3100int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003101 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003102 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003103 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003104 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003105{
3106 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003107 Py_ssize_t repsize;
3108 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003109 Py_UNICODE *uni2;
3110 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003111 Py_ssize_t collstartpos = *inpos;
3112 Py_ssize_t collendpos = *inpos+1;
3113 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003114 char *encoding = "charmap";
3115 char *reason = "character maps to <undefined>";
3116
3117 PyObject *x;
3118 /* find all unencodable characters */
3119 while (collendpos < size) {
3120 x = charmapencode_lookup(p[collendpos], mapping);
3121 if (x==NULL)
3122 return -1;
3123 else if (x!=Py_None) {
3124 Py_DECREF(x);
3125 break;
3126 }
3127 Py_DECREF(x);
3128 ++collendpos;
3129 }
3130 /* cache callback name lookup
3131 * (if not done yet, i.e. it's the first error) */
3132 if (*known_errorHandler==-1) {
3133 if ((errors==NULL) || (!strcmp(errors, "strict")))
3134 *known_errorHandler = 1;
3135 else if (!strcmp(errors, "replace"))
3136 *known_errorHandler = 2;
3137 else if (!strcmp(errors, "ignore"))
3138 *known_errorHandler = 3;
3139 else if (!strcmp(errors, "xmlcharrefreplace"))
3140 *known_errorHandler = 4;
3141 else
3142 *known_errorHandler = 0;
3143 }
3144 switch (*known_errorHandler) {
3145 case 1: /* strict */
3146 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3147 return -1;
3148 case 2: /* replace */
3149 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3150 x = charmapencode_output('?', mapping, res, respos);
3151 if (x==NULL) {
3152 return -1;
3153 }
3154 else if (x==Py_None) {
3155 Py_DECREF(x);
3156 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3157 return -1;
3158 }
3159 Py_DECREF(x);
3160 }
3161 /* fall through */
3162 case 3: /* ignore */
3163 *inpos = collendpos;
3164 break;
3165 case 4: /* xmlcharrefreplace */
3166 /* generate replacement (temporarily (mis)uses p) */
3167 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3168 char buffer[2+29+1+1];
3169 char *cp;
3170 sprintf(buffer, "&#%d;", (int)p[collpos]);
3171 for (cp = buffer; *cp; ++cp) {
3172 x = charmapencode_output(*cp, mapping, res, respos);
3173 if (x==NULL)
3174 return -1;
3175 else if (x==Py_None) {
3176 Py_DECREF(x);
3177 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3178 return -1;
3179 }
3180 Py_DECREF(x);
3181 }
3182 }
3183 *inpos = collendpos;
3184 break;
3185 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003186 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003187 encoding, reason, p, size, exceptionObject,
3188 collstartpos, collendpos, &newpos);
3189 if (repunicode == NULL)
3190 return -1;
3191 /* generate replacement */
3192 repsize = PyUnicode_GET_SIZE(repunicode);
3193 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3194 x = charmapencode_output(*uni2, mapping, res, respos);
3195 if (x==NULL) {
3196 Py_DECREF(repunicode);
3197 return -1;
3198 }
3199 else if (x==Py_None) {
3200 Py_DECREF(repunicode);
3201 Py_DECREF(x);
3202 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3203 return -1;
3204 }
3205 Py_DECREF(x);
3206 }
3207 *inpos = newpos;
3208 Py_DECREF(repunicode);
3209 }
3210 return 0;
3211}
3212
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003214 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 PyObject *mapping,
3216 const char *errors)
3217{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 /* output object */
3219 PyObject *res = NULL;
3220 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003221 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003222 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003223 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 PyObject *errorHandler = NULL;
3225 PyObject *exc = NULL;
3226 /* the following variable is used for caching string comparisons
3227 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3228 * 3=ignore, 4=xmlcharrefreplace */
3229 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230
3231 /* Default to Latin-1 */
3232 if (mapping == NULL)
3233 return PyUnicode_EncodeLatin1(p, size, errors);
3234
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003235 /* allocate enough for a simple encoding without
3236 replacements, if we need more, we'll resize */
3237 res = PyString_FromStringAndSize(NULL, size);
3238 if (res == NULL)
3239 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003240 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 while (inpos<size) {
3244 /* try to encode it */
3245 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3246 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 if (x==Py_None) { /* unencodable character */
3249 if (charmap_encoding_error(p, size, &inpos, mapping,
3250 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003251 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003252 &res, &respos)) {
3253 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003254 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 else
3258 /* done with this character => adjust input position */
3259 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 Py_DECREF(x);
3261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 /* Resize if we allocated to much */
3264 if (respos<PyString_GET_SIZE(res)) {
3265 if (_PyString_Resize(&res, respos))
3266 goto onError;
3267 }
3268 Py_XDECREF(exc);
3269 Py_XDECREF(errorHandler);
3270 return res;
3271
3272 onError:
3273 Py_XDECREF(res);
3274 Py_XDECREF(exc);
3275 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276 return NULL;
3277}
3278
3279PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3280 PyObject *mapping)
3281{
3282 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3283 PyErr_BadArgument();
3284 return NULL;
3285 }
3286 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3287 PyUnicode_GET_SIZE(unicode),
3288 mapping,
3289 NULL);
3290}
3291
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292/* create or adjust a UnicodeTranslateError */
3293static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003294 const Py_UNICODE *unicode, Py_ssize_t size,
3295 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 if (*exceptionObject == NULL) {
3299 *exceptionObject = PyUnicodeTranslateError_Create(
3300 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 }
3302 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3304 goto onError;
3305 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3306 goto onError;
3307 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3308 goto onError;
3309 return;
3310 onError:
3311 Py_DECREF(*exceptionObject);
3312 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 }
3314}
3315
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003316/* raises a UnicodeTranslateError */
3317static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003318 const Py_UNICODE *unicode, Py_ssize_t size,
3319 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 const char *reason)
3321{
3322 make_translate_exception(exceptionObject,
3323 unicode, size, startpos, endpos, reason);
3324 if (*exceptionObject != NULL)
3325 PyCodec_StrictErrors(*exceptionObject);
3326}
3327
3328/* error handling callback helper:
3329 build arguments, call the callback and check the arguments,
3330 put the result into newpos and return the replacement string, which
3331 has to be freed by the caller */
3332static PyObject *unicode_translate_call_errorhandler(const char *errors,
3333 PyObject **errorHandler,
3334 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003335 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3336 Py_ssize_t startpos, Py_ssize_t endpos,
3337 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338{
3339 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3340
Martin v. Löwis18e16552006-02-15 17:27:45 +00003341 int i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 PyObject *restuple;
3343 PyObject *resunicode;
3344
3345 if (*errorHandler == NULL) {
3346 *errorHandler = PyCodec_LookupError(errors);
3347 if (*errorHandler == NULL)
3348 return NULL;
3349 }
3350
3351 make_translate_exception(exceptionObject,
3352 unicode, size, startpos, endpos, reason);
3353 if (*exceptionObject == NULL)
3354 return NULL;
3355
3356 restuple = PyObject_CallFunctionObjArgs(
3357 *errorHandler, *exceptionObject, NULL);
3358 if (restuple == NULL)
3359 return NULL;
3360 if (!PyTuple_Check(restuple)) {
3361 PyErr_Format(PyExc_TypeError, &argparse[4]);
3362 Py_DECREF(restuple);
3363 return NULL;
3364 }
3365 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003366 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003367 Py_DECREF(restuple);
3368 return NULL;
3369 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003370 if (i_newpos<0)
3371 *newpos = size+i_newpos;
3372 else
3373 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003374 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003375 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003376 Py_DECREF(restuple);
3377 return NULL;
3378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 Py_INCREF(resunicode);
3380 Py_DECREF(restuple);
3381 return resunicode;
3382}
3383
3384/* Lookup the character ch in the mapping and put the result in result,
3385 which must be decrefed by the caller.
3386 Return 0 on success, -1 on error */
3387static
3388int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3389{
3390 PyObject *w = PyInt_FromLong((long)c);
3391 PyObject *x;
3392
3393 if (w == NULL)
3394 return -1;
3395 x = PyObject_GetItem(mapping, w);
3396 Py_DECREF(w);
3397 if (x == NULL) {
3398 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3399 /* No mapping found means: use 1:1 mapping. */
3400 PyErr_Clear();
3401 *result = NULL;
3402 return 0;
3403 } else
3404 return -1;
3405 }
3406 else if (x == Py_None) {
3407 *result = x;
3408 return 0;
3409 }
3410 else if (PyInt_Check(x)) {
3411 long value = PyInt_AS_LONG(x);
3412 long max = PyUnicode_GetMax();
3413 if (value < 0 || value > max) {
3414 PyErr_Format(PyExc_TypeError,
3415 "character mapping must be in range(0x%lx)", max+1);
3416 Py_DECREF(x);
3417 return -1;
3418 }
3419 *result = x;
3420 return 0;
3421 }
3422 else if (PyUnicode_Check(x)) {
3423 *result = x;
3424 return 0;
3425 }
3426 else {
3427 /* wrong return value */
3428 PyErr_SetString(PyExc_TypeError,
3429 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003430 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 return -1;
3432 }
3433}
3434/* ensure that *outobj is at least requiredsize characters long,
3435if not reallocate and adjust various state variables.
3436Return 0 on success, -1 on error */
3437static
Walter Dörwald4894c302003-10-24 14:25:28 +00003438int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003439 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003441 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003442 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003443 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003444 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003446 if (requiredsize < 2 * oldsize)
3447 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003448 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003449 return -1;
3450 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451 }
3452 return 0;
3453}
3454/* lookup the character, put the result in the output string and adjust
3455 various state variables. Return a new reference to the object that
3456 was put in the output buffer in *result, or Py_None, if the mapping was
3457 undefined (in which case no character was written).
3458 The called must decref result.
3459 Return 0 on success, -1 on error. */
3460static
Walter Dörwald4894c302003-10-24 14:25:28 +00003461int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003462 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003463 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464{
Walter Dörwald4894c302003-10-24 14:25:28 +00003465 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003466 return -1;
3467 if (*res==NULL) {
3468 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003469 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 }
3471 else if (*res==Py_None)
3472 ;
3473 else if (PyInt_Check(*res)) {
3474 /* no overflow check, because we know that the space is enough */
3475 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3476 }
3477 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003478 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 if (repsize==1) {
3480 /* no overflow check, because we know that the space is enough */
3481 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3482 }
3483 else if (repsize!=0) {
3484 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003485 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003486 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003487 repsize - 1;
3488 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 return -1;
3490 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3491 *outp += repsize;
3492 }
3493 }
3494 else
3495 return -1;
3496 return 0;
3497}
3498
3499PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003500 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 PyObject *mapping,
3502 const char *errors)
3503{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504 /* output object */
3505 PyObject *res = NULL;
3506 /* pointers to the beginning and end+1 of input */
3507 const Py_UNICODE *startp = p;
3508 const Py_UNICODE *endp = p + size;
3509 /* pointer into the output */
3510 Py_UNICODE *str;
3511 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003512 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 char *reason = "character maps to <undefined>";
3514 PyObject *errorHandler = NULL;
3515 PyObject *exc = NULL;
3516 /* the following variable is used for caching string comparisons
3517 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3518 * 3=ignore, 4=xmlcharrefreplace */
3519 int known_errorHandler = -1;
3520
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521 if (mapping == NULL) {
3522 PyErr_BadArgument();
3523 return NULL;
3524 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525
3526 /* allocate enough for a simple 1:1 translation without
3527 replacements, if we need more, we'll resize */
3528 res = PyUnicode_FromUnicode(NULL, size);
3529 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003530 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 return res;
3533 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 while (p<endp) {
3536 /* try to encode it */
3537 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003538 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 goto onError;
3541 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003542 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 if (x!=Py_None) /* it worked => adjust input pointer */
3544 ++p;
3545 else { /* untranslatable character */
3546 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003547 Py_ssize_t repsize;
3548 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 Py_UNICODE *uni2;
3550 /* startpos for collecting untranslatable chars */
3551 const Py_UNICODE *collstart = p;
3552 const Py_UNICODE *collend = p+1;
3553 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 /* find all untranslatable characters */
3556 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003557 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 goto onError;
3559 Py_XDECREF(x);
3560 if (x!=Py_None)
3561 break;
3562 ++collend;
3563 }
3564 /* cache callback name lookup
3565 * (if not done yet, i.e. it's the first error) */
3566 if (known_errorHandler==-1) {
3567 if ((errors==NULL) || (!strcmp(errors, "strict")))
3568 known_errorHandler = 1;
3569 else if (!strcmp(errors, "replace"))
3570 known_errorHandler = 2;
3571 else if (!strcmp(errors, "ignore"))
3572 known_errorHandler = 3;
3573 else if (!strcmp(errors, "xmlcharrefreplace"))
3574 known_errorHandler = 4;
3575 else
3576 known_errorHandler = 0;
3577 }
3578 switch (known_errorHandler) {
3579 case 1: /* strict */
3580 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3581 goto onError;
3582 case 2: /* replace */
3583 /* No need to check for space, this is a 1:1 replacement */
3584 for (coll = collstart; coll<collend; ++coll)
3585 *str++ = '?';
3586 /* fall through */
3587 case 3: /* ignore */
3588 p = collend;
3589 break;
3590 case 4: /* xmlcharrefreplace */
3591 /* generate replacement (temporarily (mis)uses p) */
3592 for (p = collstart; p < collend; ++p) {
3593 char buffer[2+29+1+1];
3594 char *cp;
3595 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003596 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3598 goto onError;
3599 for (cp = buffer; *cp; ++cp)
3600 *str++ = *cp;
3601 }
3602 p = collend;
3603 break;
3604 default:
3605 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3606 reason, startp, size, &exc,
3607 collstart-startp, collend-startp, &newpos);
3608 if (repunicode == NULL)
3609 goto onError;
3610 /* generate replacement */
3611 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003612 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3614 Py_DECREF(repunicode);
3615 goto onError;
3616 }
3617 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3618 *str++ = *uni2;
3619 p = startp + newpos;
3620 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 }
3622 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 /* Resize if we allocated to much */
3625 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003626 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003627 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003628 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 }
3630 Py_XDECREF(exc);
3631 Py_XDECREF(errorHandler);
3632 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 onError:
3635 Py_XDECREF(res);
3636 Py_XDECREF(exc);
3637 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 return NULL;
3639}
3640
3641PyObject *PyUnicode_Translate(PyObject *str,
3642 PyObject *mapping,
3643 const char *errors)
3644{
3645 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003646
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 str = PyUnicode_FromObject(str);
3648 if (str == NULL)
3649 goto onError;
3650 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3651 PyUnicode_GET_SIZE(str),
3652 mapping,
3653 errors);
3654 Py_DECREF(str);
3655 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003656
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 onError:
3658 Py_XDECREF(str);
3659 return NULL;
3660}
Tim Petersced69f82003-09-16 20:30:58 +00003661
Guido van Rossum9e896b32000-04-05 20:11:21 +00003662/* --- Decimal Encoder ---------------------------------------------------- */
3663
3664int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003665 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003666 char *output,
3667 const char *errors)
3668{
3669 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 PyObject *errorHandler = NULL;
3671 PyObject *exc = NULL;
3672 const char *encoding = "decimal";
3673 const char *reason = "invalid decimal Unicode string";
3674 /* the following variable is used for caching string comparisons
3675 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3676 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003677
3678 if (output == NULL) {
3679 PyErr_BadArgument();
3680 return -1;
3681 }
3682
3683 p = s;
3684 end = s + length;
3685 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003687 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003689 Py_ssize_t repsize;
3690 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 Py_UNICODE *uni2;
3692 Py_UNICODE *collstart;
3693 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003694
Guido van Rossum9e896b32000-04-05 20:11:21 +00003695 if (Py_UNICODE_ISSPACE(ch)) {
3696 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003697 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003698 continue;
3699 }
3700 decimal = Py_UNICODE_TODECIMAL(ch);
3701 if (decimal >= 0) {
3702 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003704 continue;
3705 }
Guido van Rossumba477042000-04-06 18:18:10 +00003706 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003707 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003709 continue;
3710 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 /* All other characters are considered unencodable */
3712 collstart = p;
3713 collend = p+1;
3714 while (collend < end) {
3715 if ((0 < *collend && *collend < 256) ||
3716 !Py_UNICODE_ISSPACE(*collend) ||
3717 Py_UNICODE_TODECIMAL(*collend))
3718 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003719 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 /* cache callback name lookup
3721 * (if not done yet, i.e. it's the first error) */
3722 if (known_errorHandler==-1) {
3723 if ((errors==NULL) || (!strcmp(errors, "strict")))
3724 known_errorHandler = 1;
3725 else if (!strcmp(errors, "replace"))
3726 known_errorHandler = 2;
3727 else if (!strcmp(errors, "ignore"))
3728 known_errorHandler = 3;
3729 else if (!strcmp(errors, "xmlcharrefreplace"))
3730 known_errorHandler = 4;
3731 else
3732 known_errorHandler = 0;
3733 }
3734 switch (known_errorHandler) {
3735 case 1: /* strict */
3736 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3737 goto onError;
3738 case 2: /* replace */
3739 for (p = collstart; p < collend; ++p)
3740 *output++ = '?';
3741 /* fall through */
3742 case 3: /* ignore */
3743 p = collend;
3744 break;
3745 case 4: /* xmlcharrefreplace */
3746 /* generate replacement (temporarily (mis)uses p) */
3747 for (p = collstart; p < collend; ++p)
3748 output += sprintf(output, "&#%d;", (int)*p);
3749 p = collend;
3750 break;
3751 default:
3752 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3753 encoding, reason, s, length, &exc,
3754 collstart-s, collend-s, &newpos);
3755 if (repunicode == NULL)
3756 goto onError;
3757 /* generate replacement */
3758 repsize = PyUnicode_GET_SIZE(repunicode);
3759 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3760 Py_UNICODE ch = *uni2;
3761 if (Py_UNICODE_ISSPACE(ch))
3762 *output++ = ' ';
3763 else {
3764 decimal = Py_UNICODE_TODECIMAL(ch);
3765 if (decimal >= 0)
3766 *output++ = '0' + decimal;
3767 else if (0 < ch && ch < 256)
3768 *output++ = (char)ch;
3769 else {
3770 Py_DECREF(repunicode);
3771 raise_encode_exception(&exc, encoding,
3772 s, length, collstart-s, collend-s, reason);
3773 goto onError;
3774 }
3775 }
3776 }
3777 p = s + newpos;
3778 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003779 }
3780 }
3781 /* 0-terminate the output string */
3782 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 Py_XDECREF(exc);
3784 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003785 return 0;
3786
3787 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 Py_XDECREF(exc);
3789 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003790 return -1;
3791}
3792
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793/* --- Helpers ------------------------------------------------------------ */
3794
Tim Petersced69f82003-09-16 20:30:58 +00003795static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003796Py_ssize_t count(PyUnicodeObject *self,
3797 Py_ssize_t start,
3798 Py_ssize_t end,
3799 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800{
3801 int count = 0;
3802
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003803 if (start < 0)
3804 start += self->length;
3805 if (start < 0)
3806 start = 0;
3807 if (end > self->length)
3808 end = self->length;
3809 if (end < 0)
3810 end += self->length;
3811 if (end < 0)
3812 end = 0;
3813
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003814 if (substring->length == 0)
3815 return (end - start + 1);
3816
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 end -= substring->length;
3818
3819 while (start <= end)
3820 if (Py_UNICODE_MATCH(self, start, substring)) {
3821 count++;
3822 start += substring->length;
3823 } else
3824 start++;
3825
3826 return count;
3827}
3828
Martin v. Löwis18e16552006-02-15 17:27:45 +00003829Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003831 Py_ssize_t start,
3832 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003834 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003835
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 str = PyUnicode_FromObject(str);
3837 if (str == NULL)
3838 return -1;
3839 substr = PyUnicode_FromObject(substr);
3840 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003841 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842 return -1;
3843 }
Tim Petersced69f82003-09-16 20:30:58 +00003844
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845 result = count((PyUnicodeObject *)str,
3846 start, end,
3847 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003848
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 Py_DECREF(str);
3850 Py_DECREF(substr);
3851 return result;
3852}
3853
Tim Petersced69f82003-09-16 20:30:58 +00003854static
Martin v. Löwis18e16552006-02-15 17:27:45 +00003855Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003857 Py_ssize_t start,
3858 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 int direction)
3860{
3861 if (start < 0)
3862 start += self->length;
3863 if (start < 0)
3864 start = 0;
3865
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 if (end > self->length)
3867 end = self->length;
3868 if (end < 0)
3869 end += self->length;
3870 if (end < 0)
3871 end = 0;
3872
Guido van Rossum76afbd92002-08-20 17:29:29 +00003873 if (substring->length == 0)
3874 return (direction > 0) ? start : end;
3875
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 end -= substring->length;
3877
3878 if (direction < 0) {
3879 for (; end >= start; end--)
3880 if (Py_UNICODE_MATCH(self, end, substring))
3881 return end;
3882 } else {
3883 for (; start <= end; start++)
3884 if (Py_UNICODE_MATCH(self, start, substring))
3885 return start;
3886 }
3887
3888 return -1;
3889}
3890
Martin v. Löwis18e16552006-02-15 17:27:45 +00003891Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003893 Py_ssize_t start,
3894 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 int direction)
3896{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003897 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003898
Guido van Rossumd57fd912000-03-10 22:53:23 +00003899 str = PyUnicode_FromObject(str);
3900 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003901 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 substr = PyUnicode_FromObject(substr);
3903 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003904 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003905 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 }
Tim Petersced69f82003-09-16 20:30:58 +00003907
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 result = findstring((PyUnicodeObject *)str,
3909 (PyUnicodeObject *)substr,
3910 start, end, direction);
3911 Py_DECREF(str);
3912 Py_DECREF(substr);
3913 return result;
3914}
3915
Tim Petersced69f82003-09-16 20:30:58 +00003916static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917int tailmatch(PyUnicodeObject *self,
3918 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003919 Py_ssize_t start,
3920 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 int direction)
3922{
3923 if (start < 0)
3924 start += self->length;
3925 if (start < 0)
3926 start = 0;
3927
3928 if (substring->length == 0)
3929 return 1;
3930
3931 if (end > self->length)
3932 end = self->length;
3933 if (end < 0)
3934 end += self->length;
3935 if (end < 0)
3936 end = 0;
3937
3938 end -= substring->length;
3939 if (end < start)
3940 return 0;
3941
3942 if (direction > 0) {
3943 if (Py_UNICODE_MATCH(self, end, substring))
3944 return 1;
3945 } else {
3946 if (Py_UNICODE_MATCH(self, start, substring))
3947 return 1;
3948 }
3949
3950 return 0;
3951}
3952
Martin v. Löwis18e16552006-02-15 17:27:45 +00003953Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003955 Py_ssize_t start,
3956 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 int direction)
3958{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003959 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003960
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 str = PyUnicode_FromObject(str);
3962 if (str == NULL)
3963 return -1;
3964 substr = PyUnicode_FromObject(substr);
3965 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003966 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 return -1;
3968 }
Tim Petersced69f82003-09-16 20:30:58 +00003969
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 result = tailmatch((PyUnicodeObject *)str,
3971 (PyUnicodeObject *)substr,
3972 start, end, direction);
3973 Py_DECREF(str);
3974 Py_DECREF(substr);
3975 return result;
3976}
3977
Tim Petersced69f82003-09-16 20:30:58 +00003978static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 Py_UNICODE ch)
3982{
3983 /* like wcschr, but doesn't stop at NULL characters */
3984
3985 while (size-- > 0) {
3986 if (*s == ch)
3987 return s;
3988 s++;
3989 }
3990
3991 return NULL;
3992}
3993
3994/* Apply fixfct filter to the Unicode object self and return a
3995 reference to the modified object */
3996
Tim Petersced69f82003-09-16 20:30:58 +00003997static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998PyObject *fixup(PyUnicodeObject *self,
3999 int (*fixfct)(PyUnicodeObject *s))
4000{
4001
4002 PyUnicodeObject *u;
4003
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004004 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 if (u == NULL)
4006 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004007
4008 Py_UNICODE_COPY(u->str, self->str, self->length);
4009
Tim Peters7a29bd52001-09-12 03:03:31 +00004010 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 /* fixfct should return TRUE if it modified the buffer. If
4012 FALSE, return a reference to the original buffer instead
4013 (to save space, not time) */
4014 Py_INCREF(self);
4015 Py_DECREF(u);
4016 return (PyObject*) self;
4017 }
4018 return (PyObject*) u;
4019}
4020
Tim Petersced69f82003-09-16 20:30:58 +00004021static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022int fixupper(PyUnicodeObject *self)
4023{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004024 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 Py_UNICODE *s = self->str;
4026 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004027
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 while (len-- > 0) {
4029 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004030
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 ch = Py_UNICODE_TOUPPER(*s);
4032 if (ch != *s) {
4033 status = 1;
4034 *s = ch;
4035 }
4036 s++;
4037 }
4038
4039 return status;
4040}
4041
Tim Petersced69f82003-09-16 20:30:58 +00004042static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043int fixlower(PyUnicodeObject *self)
4044{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004045 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 Py_UNICODE *s = self->str;
4047 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004048
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049 while (len-- > 0) {
4050 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004051
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 ch = Py_UNICODE_TOLOWER(*s);
4053 if (ch != *s) {
4054 status = 1;
4055 *s = ch;
4056 }
4057 s++;
4058 }
4059
4060 return status;
4061}
4062
Tim Petersced69f82003-09-16 20:30:58 +00004063static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064int fixswapcase(PyUnicodeObject *self)
4065{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004066 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 Py_UNICODE *s = self->str;
4068 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004069
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 while (len-- > 0) {
4071 if (Py_UNICODE_ISUPPER(*s)) {
4072 *s = Py_UNICODE_TOLOWER(*s);
4073 status = 1;
4074 } else if (Py_UNICODE_ISLOWER(*s)) {
4075 *s = Py_UNICODE_TOUPPER(*s);
4076 status = 1;
4077 }
4078 s++;
4079 }
4080
4081 return status;
4082}
4083
Tim Petersced69f82003-09-16 20:30:58 +00004084static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085int fixcapitalize(PyUnicodeObject *self)
4086{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004087 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004088 Py_UNICODE *s = self->str;
4089 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004090
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004091 if (len == 0)
4092 return 0;
4093 if (Py_UNICODE_ISLOWER(*s)) {
4094 *s = Py_UNICODE_TOUPPER(*s);
4095 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004097 s++;
4098 while (--len > 0) {
4099 if (Py_UNICODE_ISUPPER(*s)) {
4100 *s = Py_UNICODE_TOLOWER(*s);
4101 status = 1;
4102 }
4103 s++;
4104 }
4105 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106}
4107
4108static
4109int fixtitle(PyUnicodeObject *self)
4110{
4111 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4112 register Py_UNICODE *e;
4113 int previous_is_cased;
4114
4115 /* Shortcut for single character strings */
4116 if (PyUnicode_GET_SIZE(self) == 1) {
4117 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4118 if (*p != ch) {
4119 *p = ch;
4120 return 1;
4121 }
4122 else
4123 return 0;
4124 }
Tim Petersced69f82003-09-16 20:30:58 +00004125
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 e = p + PyUnicode_GET_SIZE(self);
4127 previous_is_cased = 0;
4128 for (; p < e; p++) {
4129 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004130
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 if (previous_is_cased)
4132 *p = Py_UNICODE_TOLOWER(ch);
4133 else
4134 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004135
4136 if (Py_UNICODE_ISLOWER(ch) ||
4137 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 Py_UNICODE_ISTITLE(ch))
4139 previous_is_cased = 1;
4140 else
4141 previous_is_cased = 0;
4142 }
4143 return 1;
4144}
4145
Tim Peters8ce9f162004-08-27 01:49:32 +00004146PyObject *
4147PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148{
Tim Peters8ce9f162004-08-27 01:49:32 +00004149 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004150 const Py_UNICODE blank = ' ';
4151 const Py_UNICODE *sep = &blank;
4152 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004153 PyUnicodeObject *res = NULL; /* the result */
4154 size_t res_alloc = 100; /* # allocated bytes for string in res */
4155 size_t res_used; /* # used bytes */
4156 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4157 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004158 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004159 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160 int i;
4161
Tim Peters05eba1f2004-08-27 21:32:02 +00004162 fseq = PySequence_Fast(seq, "");
4163 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004164 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004165 }
4166
Tim Peters91879ab2004-08-27 22:35:44 +00004167 /* Grrrr. A codec may be invoked to convert str objects to
4168 * Unicode, and so it's possible to call back into Python code
4169 * during PyUnicode_FromObject(), and so it's possible for a sick
4170 * codec to change the size of fseq (if seq is a list). Therefore
4171 * we have to keep refetching the size -- can't assume seqlen
4172 * is invariant.
4173 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004174 seqlen = PySequence_Fast_GET_SIZE(fseq);
4175 /* If empty sequence, return u"". */
4176 if (seqlen == 0) {
4177 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4178 goto Done;
4179 }
4180 /* If singleton sequence with an exact Unicode, return that. */
4181 if (seqlen == 1) {
4182 item = PySequence_Fast_GET_ITEM(fseq, 0);
4183 if (PyUnicode_CheckExact(item)) {
4184 Py_INCREF(item);
4185 res = (PyUnicodeObject *)item;
4186 goto Done;
4187 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004188 }
4189
Tim Peters05eba1f2004-08-27 21:32:02 +00004190 /* At least two items to join, or one that isn't exact Unicode. */
4191 if (seqlen > 1) {
4192 /* Set up sep and seplen -- they're needed. */
4193 if (separator == NULL) {
4194 sep = &blank;
4195 seplen = 1;
4196 }
4197 else {
4198 internal_separator = PyUnicode_FromObject(separator);
4199 if (internal_separator == NULL)
4200 goto onError;
4201 sep = PyUnicode_AS_UNICODE(internal_separator);
4202 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004203 /* In case PyUnicode_FromObject() mutated seq. */
4204 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004205 }
4206 }
4207
4208 /* Get space. */
4209 res = _PyUnicode_New((int)res_alloc);
4210 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004211 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004212 res_p = PyUnicode_AS_UNICODE(res);
4213 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004214
Tim Peters05eba1f2004-08-27 21:32:02 +00004215 for (i = 0; i < seqlen; ++i) {
4216 size_t itemlen;
4217 size_t new_res_used;
4218
4219 item = PySequence_Fast_GET_ITEM(fseq, i);
4220 /* Convert item to Unicode. */
4221 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4222 PyErr_Format(PyExc_TypeError,
4223 "sequence item %i: expected string or Unicode,"
4224 " %.80s found",
4225 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004226 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004227 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004228 item = PyUnicode_FromObject(item);
4229 if (item == NULL)
4230 goto onError;
4231 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004232
Tim Peters91879ab2004-08-27 22:35:44 +00004233 /* In case PyUnicode_FromObject() mutated seq. */
4234 seqlen = PySequence_Fast_GET_SIZE(fseq);
4235
Tim Peters8ce9f162004-08-27 01:49:32 +00004236 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004238 new_res_used = res_used + itemlen;
4239 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004240 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004241 if (i < seqlen - 1) {
4242 new_res_used += seplen;
4243 if (new_res_used < res_used || new_res_used > INT_MAX)
4244 goto Overflow;
4245 }
4246 if (new_res_used > res_alloc) {
4247 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004248 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004249 size_t oldsize = res_alloc;
4250 res_alloc += res_alloc;
4251 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004252 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004253 } while (new_res_used > res_alloc);
4254 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004255 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004257 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004258 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004260
4261 /* Copy item, and maybe the separator. */
4262 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4263 res_p += itemlen;
4264 if (i < seqlen - 1) {
4265 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4266 res_p += seplen;
4267 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004269 res_used = new_res_used;
4270 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004271
Tim Peters05eba1f2004-08-27 21:32:02 +00004272 /* Shrink res to match the used area; this probably can't fail,
4273 * but it's cheap to check.
4274 */
4275 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004276 goto onError;
4277
4278 Done:
4279 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004280 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281 return (PyObject *)res;
4282
Tim Peters8ce9f162004-08-27 01:49:32 +00004283 Overflow:
4284 PyErr_SetString(PyExc_OverflowError,
4285 "join() is too long for a Python string");
4286 Py_DECREF(item);
4287 /* fall through */
4288
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004290 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004291 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004292 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 return NULL;
4294}
4295
Tim Petersced69f82003-09-16 20:30:58 +00004296static
4297PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004298 Py_ssize_t left,
4299 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 Py_UNICODE fill)
4301{
4302 PyUnicodeObject *u;
4303
4304 if (left < 0)
4305 left = 0;
4306 if (right < 0)
4307 right = 0;
4308
Tim Peters7a29bd52001-09-12 03:03:31 +00004309 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 Py_INCREF(self);
4311 return self;
4312 }
4313
4314 u = _PyUnicode_New(left + self->length + right);
4315 if (u) {
4316 if (left)
4317 Py_UNICODE_FILL(u->str, fill, left);
4318 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4319 if (right)
4320 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4321 }
4322
4323 return u;
4324}
4325
/* Append the slice data[left:right] to `list` as a new Unicode object.
   The expansion uses three names from the calling function: the
   variable `str` (scratch PyObject *), the variable `list`, and the
   label `onError`, which is jumped to on failure.  Wrapped in
   do { } while (0) so that an invocation followed by ';' is exactly
   one statement, safe inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4336
/* Insert the slice data[left:right] at the front of `list` as a new
   Unicode object.  The expansion uses three names from the calling
   function: the variable `str` (scratch PyObject *), the variable
   `list`, and the label `onError`, which is jumped to on failure.
   Wrapped in do { } while (0) so that an invocation followed by ';'
   is exactly one statement, safe inside an unbraced if/else. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4347
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348static
4349PyObject *split_whitespace(PyUnicodeObject *self,
4350 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004351 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004353 register Py_ssize_t i;
4354 register Py_ssize_t j;
4355 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356 PyObject *str;
4357
4358 for (i = j = 0; i < len; ) {
4359 /* find a token */
4360 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4361 i++;
4362 j = i;
4363 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4364 i++;
4365 if (j < i) {
4366 if (maxcount-- <= 0)
4367 break;
4368 SPLIT_APPEND(self->str, j, i);
4369 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4370 i++;
4371 j = i;
4372 }
4373 }
4374 if (j < len) {
4375 SPLIT_APPEND(self->str, j, len);
4376 }
4377 return list;
4378
4379 onError:
4380 Py_DECREF(list);
4381 return NULL;
4382}
4383
4384PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004385 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004387 register Py_ssize_t i;
4388 register Py_ssize_t j;
4389 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 PyObject *list;
4391 PyObject *str;
4392 Py_UNICODE *data;
4393
4394 string = PyUnicode_FromObject(string);
4395 if (string == NULL)
4396 return NULL;
4397 data = PyUnicode_AS_UNICODE(string);
4398 len = PyUnicode_GET_SIZE(string);
4399
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 list = PyList_New(0);
4401 if (!list)
4402 goto onError;
4403
4404 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004405 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004406
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407 /* Find a line and append it */
4408 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4409 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410
4411 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004412 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 if (i < len) {
4414 if (data[i] == '\r' && i + 1 < len &&
4415 data[i+1] == '\n')
4416 i += 2;
4417 else
4418 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004419 if (keepends)
4420 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 }
Guido van Rossum86662912000-04-11 15:38:46 +00004422 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 j = i;
4424 }
4425 if (j < len) {
4426 SPLIT_APPEND(data, j, len);
4427 }
4428
4429 Py_DECREF(string);
4430 return list;
4431
4432 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004433 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 Py_DECREF(string);
4435 return NULL;
4436}
4437
Tim Petersced69f82003-09-16 20:30:58 +00004438static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439PyObject *split_char(PyUnicodeObject *self,
4440 PyObject *list,
4441 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004442 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004444 register Py_ssize_t i;
4445 register Py_ssize_t j;
4446 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 PyObject *str;
4448
4449 for (i = j = 0; i < len; ) {
4450 if (self->str[i] == ch) {
4451 if (maxcount-- <= 0)
4452 break;
4453 SPLIT_APPEND(self->str, j, i);
4454 i = j = i + 1;
4455 } else
4456 i++;
4457 }
4458 if (j <= len) {
4459 SPLIT_APPEND(self->str, j, len);
4460 }
4461 return list;
4462
4463 onError:
4464 Py_DECREF(list);
4465 return NULL;
4466}
4467
Tim Petersced69f82003-09-16 20:30:58 +00004468static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469PyObject *split_substring(PyUnicodeObject *self,
4470 PyObject *list,
4471 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004472 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004474 register Py_ssize_t i;
4475 register Py_ssize_t j;
4476 Py_ssize_t len = self->length;
4477 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 PyObject *str;
4479
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004480 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 if (Py_UNICODE_MATCH(self, i, substring)) {
4482 if (maxcount-- <= 0)
4483 break;
4484 SPLIT_APPEND(self->str, j, i);
4485 i = j = i + sublen;
4486 } else
4487 i++;
4488 }
4489 if (j <= len) {
4490 SPLIT_APPEND(self->str, j, len);
4491 }
4492 return list;
4493
4494 onError:
4495 Py_DECREF(list);
4496 return NULL;
4497}
4498
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004499static
4500PyObject *rsplit_whitespace(PyUnicodeObject *self,
4501 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004502 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004503{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004504 register Py_ssize_t i;
4505 register Py_ssize_t j;
4506 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004507 PyObject *str;
4508
4509 for (i = j = len - 1; i >= 0; ) {
4510 /* find a token */
4511 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4512 i--;
4513 j = i;
4514 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4515 i--;
4516 if (j > i) {
4517 if (maxcount-- <= 0)
4518 break;
4519 SPLIT_INSERT(self->str, i + 1, j + 1);
4520 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4521 i--;
4522 j = i;
4523 }
4524 }
4525 if (j >= 0) {
4526 SPLIT_INSERT(self->str, 0, j + 1);
4527 }
4528 return list;
4529
4530 onError:
4531 Py_DECREF(list);
4532 return NULL;
4533}
4534
4535static
4536PyObject *rsplit_char(PyUnicodeObject *self,
4537 PyObject *list,
4538 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004539 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004540{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004541 register Py_ssize_t i;
4542 register Py_ssize_t j;
4543 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004544 PyObject *str;
4545
4546 for (i = j = len - 1; i >= 0; ) {
4547 if (self->str[i] == ch) {
4548 if (maxcount-- <= 0)
4549 break;
4550 SPLIT_INSERT(self->str, i + 1, j + 1);
4551 j = i = i - 1;
4552 } else
4553 i--;
4554 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004555 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004556 SPLIT_INSERT(self->str, 0, j + 1);
4557 }
4558 return list;
4559
4560 onError:
4561 Py_DECREF(list);
4562 return NULL;
4563}
4564
4565static
4566PyObject *rsplit_substring(PyUnicodeObject *self,
4567 PyObject *list,
4568 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004569 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004570{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004571 register Py_ssize_t i;
4572 register Py_ssize_t j;
4573 Py_ssize_t len = self->length;
4574 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004575 PyObject *str;
4576
4577 for (i = len - sublen, j = len; i >= 0; ) {
4578 if (Py_UNICODE_MATCH(self, i, substring)) {
4579 if (maxcount-- <= 0)
4580 break;
4581 SPLIT_INSERT(self->str, i + sublen, j);
4582 j = i;
4583 i -= sublen;
4584 } else
4585 i--;
4586 }
4587 if (j >= 0) {
4588 SPLIT_INSERT(self->str, 0, j);
4589 }
4590 return list;
4591
4592 onError:
4593 Py_DECREF(list);
4594 return NULL;
4595}
4596
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004598#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599
4600static
4601PyObject *split(PyUnicodeObject *self,
4602 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004603 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604{
4605 PyObject *list;
4606
4607 if (maxcount < 0)
4608 maxcount = INT_MAX;
4609
4610 list = PyList_New(0);
4611 if (!list)
4612 return NULL;
4613
4614 if (substring == NULL)
4615 return split_whitespace(self,list,maxcount);
4616
4617 else if (substring->length == 1)
4618 return split_char(self,list,substring->str[0],maxcount);
4619
4620 else if (substring->length == 0) {
4621 Py_DECREF(list);
4622 PyErr_SetString(PyExc_ValueError, "empty separator");
4623 return NULL;
4624 }
4625 else
4626 return split_substring(self,list,substring,maxcount);
4627}
4628
Tim Petersced69f82003-09-16 20:30:58 +00004629static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004630PyObject *rsplit(PyUnicodeObject *self,
4631 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004632 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004633{
4634 PyObject *list;
4635
4636 if (maxcount < 0)
4637 maxcount = INT_MAX;
4638
4639 list = PyList_New(0);
4640 if (!list)
4641 return NULL;
4642
4643 if (substring == NULL)
4644 return rsplit_whitespace(self,list,maxcount);
4645
4646 else if (substring->length == 1)
4647 return rsplit_char(self,list,substring->str[0],maxcount);
4648
4649 else if (substring->length == 0) {
4650 Py_DECREF(list);
4651 PyErr_SetString(PyExc_ValueError, "empty separator");
4652 return NULL;
4653 }
4654 else
4655 return rsplit_substring(self,list,substring,maxcount);
4656}
4657
4658static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659PyObject *replace(PyUnicodeObject *self,
4660 PyUnicodeObject *str1,
4661 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004662 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663{
4664 PyUnicodeObject *u;
4665
4666 if (maxcount < 0)
4667 maxcount = INT_MAX;
4668
4669 if (str1->length == 1 && str2->length == 1) {
4670 int i;
4671
4672 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004673 if (!findchar(self->str, self->length, str1->str[0]) &&
4674 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 /* nothing to replace, return original string */
4676 Py_INCREF(self);
4677 u = self;
4678 } else {
4679 Py_UNICODE u1 = str1->str[0];
4680 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004681
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004683 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 self->length
4685 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004686 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004687 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004688 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 for (i = 0; i < u->length; i++)
4690 if (u->str[i] == u1) {
4691 if (--maxcount < 0)
4692 break;
4693 u->str[i] = u2;
4694 }
4695 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697
4698 } else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004699 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700 Py_UNICODE *p;
4701
4702 /* replace strings */
4703 n = count(self, 0, self->length, str1);
4704 if (n > maxcount)
4705 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004706 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004708 if (PyUnicode_CheckExact(self)) {
4709 Py_INCREF(self);
4710 u = self;
4711 }
4712 else {
4713 u = (PyUnicodeObject *)
4714 PyUnicode_FromUnicode(self->str, self->length);
4715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716 } else {
4717 u = _PyUnicode_New(
4718 self->length + n * (str2->length - str1->length));
4719 if (u) {
4720 i = 0;
4721 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004722 if (str1->length > 0) {
4723 while (i <= self->length - str1->length)
4724 if (Py_UNICODE_MATCH(self, i, str1)) {
4725 /* replace string segment */
4726 Py_UNICODE_COPY(p, str2->str, str2->length);
4727 p += str2->length;
4728 i += str1->length;
4729 if (--n <= 0) {
4730 /* copy remaining part */
4731 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4732 break;
4733 }
4734 } else
4735 *p++ = self->str[i++];
4736 } else {
4737 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 Py_UNICODE_COPY(p, str2->str, str2->length);
4739 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004740 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004743 }
4744 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 }
4747 }
4748 }
Tim Petersced69f82003-09-16 20:30:58 +00004749
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 return (PyObject *) u;
4751}
4752
4753/* --- Unicode Object Methods --------------------------------------------- */
4754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004755PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756"S.title() -> unicode\n\
4757\n\
4758Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004759characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760
4761static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004762unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 return fixup(self, fixtitle);
4765}
4766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004767PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768"S.capitalize() -> unicode\n\
4769\n\
4770Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004771have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772
4773static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004774unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 return fixup(self, fixcapitalize);
4777}
4778
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method S.capwords().

   Splits self into a list via split(self, NULL, -1), replaces every list
   item by the result of fixup(item, fixcapitalize) and finally joins the
   list with PyUnicode_Join(NULL, list).  Returns NULL as soon as one of
   these steps fails; the temporary list is released on every path. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                  fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the list's reference to the old word before overwriting
           the slot: PyList_SET_ITEM does not release the previous item. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4816
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004817/* Argument converter. Coerces to a single unicode character */
4818
4819static int
4820convert_uc(PyObject *obj, void *addr)
4821{
4822 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4823 PyObject *uniobj;
4824 Py_UNICODE *unistr;
4825
4826 uniobj = PyUnicode_FromObject(obj);
4827 if (uniobj == NULL) {
4828 PyErr_SetString(PyExc_TypeError,
4829 "The fill character cannot be converted to Unicode");
4830 return 0;
4831 }
4832 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4833 PyErr_SetString(PyExc_TypeError,
4834 "The fill character must be exactly one character long");
4835 Py_DECREF(uniobj);
4836 return 0;
4837 }
4838 unistr = PyUnicode_AS_UNICODE(uniobj);
4839 *fillcharloc = unistr[0];
4840 Py_DECREF(uniobj);
4841 return 1;
4842}
4843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004844PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004845"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004847Return S centered in a Unicode string of length width. Padding is\n\
4848done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849
4850static PyObject *
4851unicode_center(PyUnicodeObject *self, PyObject *args)
4852{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004853 Py_ssize_t marg, left;
4854 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004855 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856
Thomas Woutersde017742006-02-16 19:34:37 +00004857 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 return NULL;
4859
Tim Peters7a29bd52001-09-12 03:03:31 +00004860 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 Py_INCREF(self);
4862 return (PyObject*) self;
4863 }
4864
4865 marg = width - self->length;
4866 left = marg / 2 + (marg & width & 1);
4867
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004868 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869}
4870
Marc-André Lemburge5034372000-08-08 08:04:29 +00004871#if 0
4872
4873/* This code should go into some future Unicode collation support
4874 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004875 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004876
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004877/* speedy UTF-16 code point order comparison */
4878/* gleaned from: */
4879/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4880
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004881static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004882{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004883 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004884 0, 0, 0, 0, 0, 0, 0, 0,
4885 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004886 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004887};
4888
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889static int
4890unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4891{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004892 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004893
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 Py_UNICODE *s1 = str1->str;
4895 Py_UNICODE *s2 = str2->str;
4896
4897 len1 = str1->length;
4898 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004899
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004901 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004902
4903 c1 = *s1++;
4904 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004905
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004906 if (c1 > (1<<11) * 26)
4907 c1 += utf16Fixup[c1>>11];
4908 if (c2 > (1<<11) * 26)
4909 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004910 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004911
4912 if (c1 != c2)
4913 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004914
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004915 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916 }
4917
4918 return (len1 < len2) ? -1 : (len1 != len2);
4919}
4920
Marc-André Lemburge5034372000-08-08 08:04:29 +00004921#else
4922
4923static int
4924unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4925{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004926 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004927
4928 Py_UNICODE *s1 = str1->str;
4929 Py_UNICODE *s2 = str2->str;
4930
4931 len1 = str1->length;
4932 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004933
Marc-André Lemburge5034372000-08-08 08:04:29 +00004934 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004935 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004936
Fredrik Lundh45714e92001-06-26 16:39:36 +00004937 c1 = *s1++;
4938 c2 = *s2++;
4939
4940 if (c1 != c2)
4941 return (c1 < c2) ? -1 : 1;
4942
Marc-André Lemburge5034372000-08-08 08:04:29 +00004943 len1--; len2--;
4944 }
4945
4946 return (len1 < len2) ? -1 : (len1 != len2);
4947}
4948
4949#endif
4950
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951int PyUnicode_Compare(PyObject *left,
4952 PyObject *right)
4953{
4954 PyUnicodeObject *u = NULL, *v = NULL;
4955 int result;
4956
4957 /* Coerce the two arguments */
4958 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4959 if (u == NULL)
4960 goto onError;
4961 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4962 if (v == NULL)
4963 goto onError;
4964
Thomas Wouters7e474022000-07-16 12:04:32 +00004965 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966 if (v == u) {
4967 Py_DECREF(u);
4968 Py_DECREF(v);
4969 return 0;
4970 }
4971
4972 result = unicode_compare(u, v);
4973
4974 Py_DECREF(u);
4975 Py_DECREF(v);
4976 return result;
4977
4978onError:
4979 Py_XDECREF(u);
4980 Py_XDECREF(v);
4981 return -1;
4982}
4983
Guido van Rossum403d68b2000-03-13 15:55:09 +00004984int PyUnicode_Contains(PyObject *container,
4985 PyObject *element)
4986{
4987 PyUnicodeObject *u = NULL, *v = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004988 int result;
4989 Py_ssize_t size;
Barry Warsaw817918c2002-08-06 16:58:21 +00004990 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004991
4992 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004993 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004994 if (v == NULL) {
4995 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004996 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004997 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004998 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004999 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00005000 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005001 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005002
Barry Warsaw817918c2002-08-06 16:58:21 +00005003 size = PyUnicode_GET_SIZE(v);
5004 rhs = PyUnicode_AS_UNICODE(v);
5005 lhs = PyUnicode_AS_UNICODE(u);
5006
Guido van Rossum403d68b2000-03-13 15:55:09 +00005007 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00005008 if (size == 1) {
5009 end = lhs + PyUnicode_GET_SIZE(u);
5010 while (lhs < end) {
5011 if (*lhs++ == *rhs) {
5012 result = 1;
5013 break;
5014 }
5015 }
5016 }
5017 else {
5018 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5019 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00005020 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005021 result = 1;
5022 break;
5023 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005024 }
5025 }
5026
5027 Py_DECREF(u);
5028 Py_DECREF(v);
5029 return result;
5030
5031onError:
5032 Py_XDECREF(u);
5033 Py_XDECREF(v);
5034 return -1;
5035}
5036
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037/* Concat to string or Unicode object giving a new Unicode object. */
5038
5039PyObject *PyUnicode_Concat(PyObject *left,
5040 PyObject *right)
5041{
5042 PyUnicodeObject *u = NULL, *v = NULL, *w;
5043
5044 /* Coerce the two arguments */
5045 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5046 if (u == NULL)
5047 goto onError;
5048 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5049 if (v == NULL)
5050 goto onError;
5051
5052 /* Shortcuts */
5053 if (v == unicode_empty) {
5054 Py_DECREF(v);
5055 return (PyObject *)u;
5056 }
5057 if (u == unicode_empty) {
5058 Py_DECREF(u);
5059 return (PyObject *)v;
5060 }
5061
5062 /* Concat the two Unicode strings */
5063 w = _PyUnicode_New(u->length + v->length);
5064 if (w == NULL)
5065 goto onError;
5066 Py_UNICODE_COPY(w->str, u->str, u->length);
5067 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5068
5069 Py_DECREF(u);
5070 Py_DECREF(v);
5071 return (PyObject *)w;
5072
5073onError:
5074 Py_XDECREF(u);
5075 Py_XDECREF(v);
5076 return NULL;
5077}
5078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005079PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080"S.count(sub[, start[, end]]) -> int\n\
5081\n\
5082Return the number of occurrences of substring sub in Unicode string\n\
5083S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005084interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085
5086static PyObject *
5087unicode_count(PyUnicodeObject *self, PyObject *args)
5088{
5089 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005090 Py_ssize_t start = 0;
5091 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 PyObject *result;
5093
Guido van Rossumb8872e62000-05-09 14:14:27 +00005094 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5095 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 return NULL;
5097
5098 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5099 (PyObject *)substring);
5100 if (substring == NULL)
5101 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005102
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 if (start < 0)
5104 start += self->length;
5105 if (start < 0)
5106 start = 0;
5107 if (end > self->length)
5108 end = self->length;
5109 if (end < 0)
5110 end += self->length;
5111 if (end < 0)
5112 end = 0;
5113
5114 result = PyInt_FromLong((long) count(self, start, end, substring));
5115
5116 Py_DECREF(substring);
5117 return result;
5118}
5119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005120PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005121"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005123Encodes S using the codec registered for encoding. encoding defaults\n\
5124to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005125handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5127'xmlcharrefreplace' as well as any other name registered with\n\
5128codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129
5130static PyObject *
5131unicode_encode(PyUnicodeObject *self, PyObject *args)
5132{
5133 char *encoding = NULL;
5134 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005135 PyObject *v;
5136
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5138 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005139 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005140 if (v == NULL)
5141 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005142 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5143 PyErr_Format(PyExc_TypeError,
5144 "encoder did not return a string/unicode object "
5145 "(type=%.400s)",
5146 v->ob_type->tp_name);
5147 Py_DECREF(v);
5148 return NULL;
5149 }
5150 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005151
5152 onError:
5153 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005154}
5155
5156PyDoc_STRVAR(decode__doc__,
5157"S.decode([encoding[,errors]]) -> string or unicode\n\
5158\n\
5159Decodes S using the codec registered for encoding. encoding defaults\n\
5160to the default encoding. errors may be given to set a different error\n\
5161handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5162a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5163as well as any other name registerd with codecs.register_error that is\n\
5164able to handle UnicodeDecodeErrors.");
5165
5166static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005167unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005168{
5169 char *encoding = NULL;
5170 char *errors = NULL;
5171 PyObject *v;
5172
5173 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5174 return NULL;
5175 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005176 if (v == NULL)
5177 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005178 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5179 PyErr_Format(PyExc_TypeError,
5180 "decoder did not return a string/unicode object "
5181 "(type=%.400s)",
5182 v->ob_type->tp_name);
5183 Py_DECREF(v);
5184 return NULL;
5185 }
5186 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005187
5188 onError:
5189 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190}
5191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005192PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193"S.expandtabs([tabsize]) -> unicode\n\
5194\n\
5195Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005196If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197
5198static PyObject*
5199unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5200{
5201 Py_UNICODE *e;
5202 Py_UNICODE *p;
5203 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 PyUnicodeObject *u;
5206 int tabsize = 8;
5207
5208 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5209 return NULL;
5210
Thomas Wouters7e474022000-07-16 12:04:32 +00005211 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 i = j = 0;
5213 e = self->str + self->length;
5214 for (p = self->str; p < e; p++)
5215 if (*p == '\t') {
5216 if (tabsize > 0)
5217 j += tabsize - (j % tabsize);
5218 }
5219 else {
5220 j++;
5221 if (*p == '\n' || *p == '\r') {
5222 i += j;
5223 j = 0;
5224 }
5225 }
5226
5227 /* Second pass: create output string and fill it */
5228 u = _PyUnicode_New(i + j);
5229 if (!u)
5230 return NULL;
5231
5232 j = 0;
5233 q = u->str;
5234
5235 for (p = self->str; p < e; p++)
5236 if (*p == '\t') {
5237 if (tabsize > 0) {
5238 i = tabsize - (j % tabsize);
5239 j += i;
5240 while (i--)
5241 *q++ = ' ';
5242 }
5243 }
5244 else {
5245 j++;
5246 *q++ = *p;
5247 if (*p == '\n' || *p == '\r')
5248 j = 0;
5249 }
5250
5251 return (PyObject*) u;
5252}
5253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005254PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255"S.find(sub [,start [,end]]) -> int\n\
5256\n\
5257Return the lowest index in S where substring sub is found,\n\
5258such that sub is contained within s[start,end]. Optional\n\
5259arguments start and end are interpreted as in slice notation.\n\
5260\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005261Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262
5263static PyObject *
5264unicode_find(PyUnicodeObject *self, PyObject *args)
5265{
5266 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 Py_ssize_t start = 0;
5268 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 PyObject *result;
5270
Guido van Rossumb8872e62000-05-09 14:14:27 +00005271 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5272 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 return NULL;
5274 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5275 (PyObject *)substring);
5276 if (substring == NULL)
5277 return NULL;
5278
Martin v. Löwis18e16552006-02-15 17:27:45 +00005279 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280
5281 Py_DECREF(substring);
5282 return result;
5283}
5284
5285static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005286unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287{
5288 if (index < 0 || index >= self->length) {
5289 PyErr_SetString(PyExc_IndexError, "string index out of range");
5290 return NULL;
5291 }
5292
5293 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5294}
5295
5296static long
5297unicode_hash(PyUnicodeObject *self)
5298{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005299 /* Since Unicode objects compare equal to their ASCII string
5300 counterparts, they should use the individual character values
5301 as basis for their hash value. This is needed to assure that
5302 strings and Unicode objects behave in the same way as
5303 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
Martin v. Löwis18e16552006-02-15 17:27:45 +00005305 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005306 register Py_UNICODE *p;
5307 register long x;
5308
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 if (self->hash != -1)
5310 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005311 len = PyUnicode_GET_SIZE(self);
5312 p = PyUnicode_AS_UNICODE(self);
5313 x = *p << 7;
5314 while (--len >= 0)
5315 x = (1000003*x) ^ *p++;
5316 x ^= PyUnicode_GET_SIZE(self);
5317 if (x == -1)
5318 x = -2;
5319 self->hash = x;
5320 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321}
5322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005323PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324"S.index(sub [,start [,end]]) -> int\n\
5325\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005326Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327
5328static PyObject *
5329unicode_index(PyUnicodeObject *self, PyObject *args)
5330{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005331 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005333 Py_ssize_t start = 0;
5334 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335
Guido van Rossumb8872e62000-05-09 14:14:27 +00005336 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5337 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005339
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5341 (PyObject *)substring);
5342 if (substring == NULL)
5343 return NULL;
5344
5345 result = findstring(self, substring, start, end, 1);
5346
5347 Py_DECREF(substring);
5348 if (result < 0) {
5349 PyErr_SetString(PyExc_ValueError, "substring not found");
5350 return NULL;
5351 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353}
5354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005355PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005356"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005358Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005359at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360
5361static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005362unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363{
5364 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5365 register const Py_UNICODE *e;
5366 int cased;
5367
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 /* Shortcut for single character strings */
5369 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005370 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005372 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005373 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005374 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005375
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 e = p + PyUnicode_GET_SIZE(self);
5377 cased = 0;
5378 for (; p < e; p++) {
5379 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005380
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005382 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 else if (!cased && Py_UNICODE_ISLOWER(ch))
5384 cased = 1;
5385 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005386 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387}
5388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005389PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005390"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005392Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005393at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394
5395static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005396unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397{
5398 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5399 register const Py_UNICODE *e;
5400 int cased;
5401
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 /* Shortcut for single character strings */
5403 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005404 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005406 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005407 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005408 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 e = p + PyUnicode_GET_SIZE(self);
5411 cased = 0;
5412 for (; p < e; p++) {
5413 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005414
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005416 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 else if (!cased && Py_UNICODE_ISUPPER(ch))
5418 cased = 1;
5419 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005420 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421}
5422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005423PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005424"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005426Return True if S is a titlecased string and there is at least one\n\
5427character in S, i.e. upper- and titlecase characters may only\n\
5428follow uncased characters and lowercase characters only cased ones.\n\
5429Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430
5431static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005432unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433{
5434 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5435 register const Py_UNICODE *e;
5436 int cased, previous_is_cased;
5437
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 /* Shortcut for single character strings */
5439 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005440 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5441 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005443 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005444 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005445 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005446
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 e = p + PyUnicode_GET_SIZE(self);
5448 cased = 0;
5449 previous_is_cased = 0;
5450 for (; p < e; p++) {
5451 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005452
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5454 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005455 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 previous_is_cased = 1;
5457 cased = 1;
5458 }
5459 else if (Py_UNICODE_ISLOWER(ch)) {
5460 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005461 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 previous_is_cased = 1;
5463 cased = 1;
5464 }
5465 else
5466 previous_is_cased = 0;
5467 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005468 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469}
5470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005471PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005472"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005474Return True if all characters in S are whitespace\n\
5475and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476
5477static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005478unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479{
5480 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5481 register const Py_UNICODE *e;
5482
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 /* Shortcut for single character strings */
5484 if (PyUnicode_GET_SIZE(self) == 1 &&
5485 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005486 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005488 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005489 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005490 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005491
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 e = p + PyUnicode_GET_SIZE(self);
5493 for (; p < e; p++) {
5494 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005495 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005497 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498}
5499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005500PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005501"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005502\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005503Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005504and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005505
5506static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005507unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005508{
5509 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5510 register const Py_UNICODE *e;
5511
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005512 /* Shortcut for single character strings */
5513 if (PyUnicode_GET_SIZE(self) == 1 &&
5514 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005515 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005516
5517 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005518 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005519 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005520
5521 e = p + PyUnicode_GET_SIZE(self);
5522 for (; p < e; p++) {
5523 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005524 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005525 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005526 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005527}
5528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005529PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005530"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005531\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005532Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005533and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005534
5535static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005536unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005537{
5538 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5539 register const Py_UNICODE *e;
5540
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005541 /* Shortcut for single character strings */
5542 if (PyUnicode_GET_SIZE(self) == 1 &&
5543 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005544 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005545
5546 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005547 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005548 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005549
5550 e = p + PyUnicode_GET_SIZE(self);
5551 for (; p < e; p++) {
5552 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005553 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005554 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005555 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005556}
5557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005558PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005559"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005561Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005562False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
5564static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005565unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566{
5567 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5568 register const Py_UNICODE *e;
5569
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 /* Shortcut for single character strings */
5571 if (PyUnicode_GET_SIZE(self) == 1 &&
5572 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005573 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005575 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005576 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005577 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005578
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 e = p + PyUnicode_GET_SIZE(self);
5580 for (; p < e; p++) {
5581 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005582 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005584 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585}
5586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005587PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005588"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005590Return True if all characters in S are digits\n\
5591and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592
5593static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005594unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595{
5596 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5597 register const Py_UNICODE *e;
5598
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 /* Shortcut for single character strings */
5600 if (PyUnicode_GET_SIZE(self) == 1 &&
5601 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005602 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005604 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005605 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005606 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005607
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 e = p + PyUnicode_GET_SIZE(self);
5609 for (; p < e; p++) {
5610 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005611 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005613 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614}
5615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005616PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005617"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005619Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005620False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621
5622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005623unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624{
5625 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5626 register const Py_UNICODE *e;
5627
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 /* Shortcut for single character strings */
5629 if (PyUnicode_GET_SIZE(self) == 1 &&
5630 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005631 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005633 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005634 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005635 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005636
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 e = p + PyUnicode_GET_SIZE(self);
5638 for (; p < e; p++) {
5639 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005640 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005642 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643}
5644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005645PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646"S.join(sequence) -> unicode\n\
5647\n\
5648Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005649sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
5651static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005652unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005654 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655}
5656
Martin v. Löwis18e16552006-02-15 17:27:45 +00005657static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658unicode_length(PyUnicodeObject *self)
5659{
5660 return self->length;
5661}
5662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005663PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005664"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665\n\
5666Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005667done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668
5669static PyObject *
5670unicode_ljust(PyUnicodeObject *self, PyObject *args)
5671{
5672 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005673 Py_UNICODE fillchar = ' ';
5674
5675 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 return NULL;
5677
Tim Peters7a29bd52001-09-12 03:03:31 +00005678 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 Py_INCREF(self);
5680 return (PyObject*) self;
5681 }
5682
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005683 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684}
5685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005686PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687"S.lower() -> unicode\n\
5688\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005689Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690
5691static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005692unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 return fixup(self, fixlower);
5695}
5696
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
5705
5706static const Py_UNICODE *
5707unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5708{
Tim Peters030a5ce2002-04-22 19:00:10 +00005709 size_t i;
5710 for (i = 0; i < n; ++i)
5711 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005712 return s+i;
5713 return NULL;
5714}
5715
5716/* externally visible for str.strip(unicode) */
5717PyObject *
5718_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5719{
5720 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005721 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005722 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005723 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5724 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005725
5726 i = 0;
5727 if (striptype != RIGHTSTRIP) {
5728 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5729 i++;
5730 }
5731 }
5732
5733 j = len;
5734 if (striptype != LEFTSTRIP) {
5735 do {
5736 j--;
5737 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5738 j++;
5739 }
5740
5741 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5742 Py_INCREF(self);
5743 return (PyObject*)self;
5744 }
5745 else
5746 return PyUnicode_FromUnicode(s+i, j-i);
5747}
5748
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
5750static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005751do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005753 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005754 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005755
5756 i = 0;
5757 if (striptype != RIGHTSTRIP) {
5758 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5759 i++;
5760 }
5761 }
5762
5763 j = len;
5764 if (striptype != LEFTSTRIP) {
5765 do {
5766 j--;
5767 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5768 j++;
5769 }
5770
5771 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5772 Py_INCREF(self);
5773 return (PyObject*)self;
5774 }
5775 else
5776 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777}
5778
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005779
5780static PyObject *
5781do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5782{
5783 PyObject *sep = NULL;
5784
5785 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5786 return NULL;
5787
5788 if (sep != NULL && sep != Py_None) {
5789 if (PyUnicode_Check(sep))
5790 return _PyUnicode_XStrip(self, striptype, sep);
5791 else if (PyString_Check(sep)) {
5792 PyObject *res;
5793 sep = PyUnicode_FromObject(sep);
5794 if (sep==NULL)
5795 return NULL;
5796 res = _PyUnicode_XStrip(self, striptype, sep);
5797 Py_DECREF(sep);
5798 return res;
5799 }
5800 else {
5801 PyErr_Format(PyExc_TypeError,
5802 "%s arg must be None, unicode or str",
5803 STRIPNAME(striptype));
5804 return NULL;
5805 }
5806 }
5807
5808 return do_strip(self, striptype);
5809}
5810
5811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005812PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005813"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005814\n\
5815Return a copy of the string S with leading and trailing\n\
5816whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005817If chars is given and not None, remove characters in chars instead.\n\
5818If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005819
5820static PyObject *
5821unicode_strip(PyUnicodeObject *self, PyObject *args)
5822{
5823 if (PyTuple_GET_SIZE(args) == 0)
5824 return do_strip(self, BOTHSTRIP); /* Common case */
5825 else
5826 return do_argstrip(self, BOTHSTRIP, args);
5827}
5828
5829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005830PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005831"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005832\n\
5833Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005834If chars is given and not None, remove characters in chars instead.\n\
5835If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005836
5837static PyObject *
5838unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5839{
5840 if (PyTuple_GET_SIZE(args) == 0)
5841 return do_strip(self, LEFTSTRIP); /* Common case */
5842 else
5843 return do_argstrip(self, LEFTSTRIP, args);
5844}
5845
5846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005847PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005848"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005849\n\
5850Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005851If chars is given and not None, remove characters in chars instead.\n\
5852If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005853
5854static PyObject *
5855unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5856{
5857 if (PyTuple_GET_SIZE(args) == 0)
5858 return do_strip(self, RIGHTSTRIP); /* Common case */
5859 else
5860 return do_argstrip(self, RIGHTSTRIP, args);
5861}
5862
5863
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00005865unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866{
5867 PyUnicodeObject *u;
5868 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005869 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00005870 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
5872 if (len < 0)
5873 len = 0;
5874
Tim Peters7a29bd52001-09-12 03:03:31 +00005875 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 /* no repeat, return original string */
5877 Py_INCREF(str);
5878 return (PyObject*) str;
5879 }
Tim Peters8f422462000-09-09 06:13:41 +00005880
5881 /* ensure # of chars needed doesn't overflow int and # of bytes
5882 * needed doesn't overflow size_t
5883 */
5884 nchars = len * str->length;
5885 if (len && nchars / len != str->length) {
5886 PyErr_SetString(PyExc_OverflowError,
5887 "repeated string is too long");
5888 return NULL;
5889 }
5890 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5891 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5892 PyErr_SetString(PyExc_OverflowError,
5893 "repeated string is too long");
5894 return NULL;
5895 }
5896 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 if (!u)
5898 return NULL;
5899
5900 p = u->str;
5901
5902 while (len-- > 0) {
5903 Py_UNICODE_COPY(p, str->str, str->length);
5904 p += str->length;
5905 }
5906
5907 return (PyObject*) u;
5908}
5909
5910PyObject *PyUnicode_Replace(PyObject *obj,
5911 PyObject *subobj,
5912 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005913 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914{
5915 PyObject *self;
5916 PyObject *str1;
5917 PyObject *str2;
5918 PyObject *result;
5919
5920 self = PyUnicode_FromObject(obj);
5921 if (self == NULL)
5922 return NULL;
5923 str1 = PyUnicode_FromObject(subobj);
5924 if (str1 == NULL) {
5925 Py_DECREF(self);
5926 return NULL;
5927 }
5928 str2 = PyUnicode_FromObject(replobj);
5929 if (str2 == NULL) {
5930 Py_DECREF(self);
5931 Py_DECREF(str1);
5932 return NULL;
5933 }
Tim Petersced69f82003-09-16 20:30:58 +00005934 result = replace((PyUnicodeObject *)self,
5935 (PyUnicodeObject *)str1,
5936 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 maxcount);
5938 Py_DECREF(self);
5939 Py_DECREF(str1);
5940 Py_DECREF(str2);
5941 return result;
5942}
5943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005944PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945"S.replace (old, new[, maxsplit]) -> unicode\n\
5946\n\
5947Return a copy of S with all occurrences of substring\n\
5948old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005949given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
5951static PyObject*
5952unicode_replace(PyUnicodeObject *self, PyObject *args)
5953{
5954 PyUnicodeObject *str1;
5955 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005956 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 PyObject *result;
5958
Martin v. Löwis18e16552006-02-15 17:27:45 +00005959 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 return NULL;
5961 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5962 if (str1 == NULL)
5963 return NULL;
5964 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005965 if (str2 == NULL) {
5966 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969
5970 result = replace(self, str1, str2, maxcount);
5971
5972 Py_DECREF(str1);
5973 Py_DECREF(str2);
5974 return result;
5975}
5976
5977static
5978PyObject *unicode_repr(PyObject *unicode)
5979{
5980 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5981 PyUnicode_GET_SIZE(unicode),
5982 1);
5983}
5984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005985PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986"S.rfind(sub [,start [,end]]) -> int\n\
5987\n\
5988Return the highest index in S where substring sub is found,\n\
5989such that sub is contained within s[start,end]. Optional\n\
5990arguments start and end are interpreted as in slice notation.\n\
5991\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005992Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993
5994static PyObject *
5995unicode_rfind(PyUnicodeObject *self, PyObject *args)
5996{
5997 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005998 Py_ssize_t start = 0;
5999 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 PyObject *result;
6001
Guido van Rossumb8872e62000-05-09 14:14:27 +00006002 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6003 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 return NULL;
6005 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6006 (PyObject *)substring);
6007 if (substring == NULL)
6008 return NULL;
6009
Martin v. Löwis18e16552006-02-15 17:27:45 +00006010 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011
6012 Py_DECREF(substring);
6013 return result;
6014}
6015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006016PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017"S.rindex(sub [,start [,end]]) -> int\n\
6018\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006019Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021static PyObject *
6022unicode_rindex(PyUnicodeObject *self, PyObject *args)
6023{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006024 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006026 Py_ssize_t start = 0;
6027 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028
Guido van Rossumb8872e62000-05-09 14:14:27 +00006029 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6030 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 return NULL;
6032 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6033 (PyObject *)substring);
6034 if (substring == NULL)
6035 return NULL;
6036
6037 result = findstring(self, substring, start, end, -1);
6038
6039 Py_DECREF(substring);
6040 if (result < 0) {
6041 PyErr_SetString(PyExc_ValueError, "substring not found");
6042 return NULL;
6043 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006044 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045}
6046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006047PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006048"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049\n\
6050Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006051done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052
6053static PyObject *
6054unicode_rjust(PyUnicodeObject *self, PyObject *args)
6055{
6056 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006057 Py_UNICODE fillchar = ' ';
6058
6059 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 return NULL;
6061
Tim Peters7a29bd52001-09-12 03:03:31 +00006062 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 Py_INCREF(self);
6064 return (PyObject*) self;
6065 }
6066
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006067 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068}
6069
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006071unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072{
6073 /* standard clamping */
6074 if (start < 0)
6075 start = 0;
6076 if (end < 0)
6077 end = 0;
6078 if (end > self->length)
6079 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006080 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 /* full slice, return original string */
6082 Py_INCREF(self);
6083 return (PyObject*) self;
6084 }
6085 if (start > end)
6086 start = end;
6087 /* copy slice */
6088 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6089 end - start);
6090}
6091
6092PyObject *PyUnicode_Split(PyObject *s,
6093 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006094 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095{
6096 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006097
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 s = PyUnicode_FromObject(s);
6099 if (s == NULL)
6100 return NULL;
6101 if (sep != NULL) {
6102 sep = PyUnicode_FromObject(sep);
6103 if (sep == NULL) {
6104 Py_DECREF(s);
6105 return NULL;
6106 }
6107 }
6108
6109 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6110
6111 Py_DECREF(s);
6112 Py_XDECREF(sep);
6113 return result;
6114}
6115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006116PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117"S.split([sep [,maxsplit]]) -> list of strings\n\
6118\n\
6119Return a list of the words in S, using sep as the\n\
6120delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006121splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006122any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
6124static PyObject*
6125unicode_split(PyUnicodeObject *self, PyObject *args)
6126{
6127 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006128 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129
Martin v. Löwis18e16552006-02-15 17:27:45 +00006130 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 return NULL;
6132
6133 if (substring == Py_None)
6134 return split(self, NULL, maxcount);
6135 else if (PyUnicode_Check(substring))
6136 return split(self, (PyUnicodeObject *)substring, maxcount);
6137 else
6138 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6139}
6140
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006141PyObject *PyUnicode_RSplit(PyObject *s,
6142 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006143 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006144{
6145 PyObject *result;
6146
6147 s = PyUnicode_FromObject(s);
6148 if (s == NULL)
6149 return NULL;
6150 if (sep != NULL) {
6151 sep = PyUnicode_FromObject(sep);
6152 if (sep == NULL) {
6153 Py_DECREF(s);
6154 return NULL;
6155 }
6156 }
6157
6158 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6159
6160 Py_DECREF(s);
6161 Py_XDECREF(sep);
6162 return result;
6163}
6164
6165PyDoc_STRVAR(rsplit__doc__,
6166"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6167\n\
6168Return a list of the words in S, using sep as the\n\
6169delimiter string, starting at the end of the string and\n\
6170working to the front. If maxsplit is given, at most maxsplit\n\
6171splits are done. If sep is not specified, any whitespace string\n\
6172is a separator.");
6173
6174static PyObject*
6175unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6176{
6177 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006178 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006179
Martin v. Löwis18e16552006-02-15 17:27:45 +00006180 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006181 return NULL;
6182
6183 if (substring == Py_None)
6184 return rsplit(self, NULL, maxcount);
6185 else if (PyUnicode_Check(substring))
6186 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6187 else
6188 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6189}
6190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006191PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006192"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193\n\
6194Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006195Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006196is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
6198static PyObject*
6199unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6200{
Guido van Rossum86662912000-04-11 15:38:46 +00006201 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
Guido van Rossum86662912000-04-11 15:38:46 +00006203 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 return NULL;
6205
Guido van Rossum86662912000-04-11 15:38:46 +00006206 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207}
6208
6209static
6210PyObject *unicode_str(PyUnicodeObject *self)
6211{
Fred Drakee4315f52000-05-09 19:53:39 +00006212 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006215PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216"S.swapcase() -> unicode\n\
6217\n\
6218Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006219and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220
6221static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006222unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 return fixup(self, fixswapcase);
6225}
6226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006227PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228"S.translate(table) -> unicode\n\
6229\n\
6230Return a copy of the string S, where all characters have been mapped\n\
6231through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006232Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6233Unmapped characters are left untouched. Characters mapped to None\n\
6234are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235
6236static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006237unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238{
Tim Petersced69f82003-09-16 20:30:58 +00006239 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006241 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 "ignore");
6243}
6244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006245PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246"S.upper() -> unicode\n\
6247\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006248Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249
6250static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006251unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 return fixup(self, fixupper);
6254}
6255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006256PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257"S.zfill(width) -> unicode\n\
6258\n\
6259Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006260of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261
6262static PyObject *
6263unicode_zfill(PyUnicodeObject *self, PyObject *args)
6264{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006265 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266 PyUnicodeObject *u;
6267
Martin v. Löwis18e16552006-02-15 17:27:45 +00006268 Py_ssize_t width;
6269 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 return NULL;
6271
6272 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006273 if (PyUnicode_CheckExact(self)) {
6274 Py_INCREF(self);
6275 return (PyObject*) self;
6276 }
6277 else
6278 return PyUnicode_FromUnicode(
6279 PyUnicode_AS_UNICODE(self),
6280 PyUnicode_GET_SIZE(self)
6281 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 }
6283
6284 fill = width - self->length;
6285
6286 u = pad(self, fill, 0, '0');
6287
Walter Dörwald068325e2002-04-15 13:36:47 +00006288 if (u == NULL)
6289 return NULL;
6290
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 if (u->str[fill] == '+' || u->str[fill] == '-') {
6292 /* move sign to beginning of string */
6293 u->str[0] = u->str[fill];
6294 u->str[fill] = '0';
6295 }
6296
6297 return (PyObject*) u;
6298}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as a
 * Python integer.
 */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006308PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006309"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006311Return True if S starts with the specified prefix, False otherwise.\n\
6312With optional start, test S beginning at that position.\n\
6313With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314
6315static PyObject *
6316unicode_startswith(PyUnicodeObject *self,
6317 PyObject *args)
6318{
6319 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006320 Py_ssize_t start = 0;
6321 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 PyObject *result;
6323
Guido van Rossumb8872e62000-05-09 14:14:27 +00006324 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6325 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326 return NULL;
6327 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6328 (PyObject *)substring);
6329 if (substring == NULL)
6330 return NULL;
6331
Guido van Rossum77f6a652002-04-03 22:41:51 +00006332 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333
6334 Py_DECREF(substring);
6335 return result;
6336}
6337
6338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006339PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006340"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006342Return True if S ends with the specified suffix, False otherwise.\n\
6343With optional start, test S beginning at that position.\n\
6344With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345
6346static PyObject *
6347unicode_endswith(PyUnicodeObject *self,
6348 PyObject *args)
6349{
6350 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006351 Py_ssize_t start = 0;
6352 Py_ssize_t end = INT_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 PyObject *result;
6354
Guido van Rossumb8872e62000-05-09 14:14:27 +00006355 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6356 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 return NULL;
6358 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6359 (PyObject *)substring);
6360 if (substring == NULL)
6361 return NULL;
6362
Guido van Rossum77f6a652002-04-03 22:41:51 +00006363 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364
6365 Py_DECREF(substring);
6366 return result;
6367}
6368
6369
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006370
6371static PyObject *
6372unicode_getnewargs(PyUnicodeObject *v)
6373{
6374 return Py_BuildValue("(u#)", v->str, v->length);
6375}
6376
6377
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378static PyMethodDef unicode_methods[] = {
6379
6380 /* Order is according to common usage: often used methods should
6381 appear first, since lookup is done sequentially. */
6382
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006383 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6384 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6385 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006386 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006387 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6388 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6389 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6390 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6391 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6392 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6393 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6394 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6395 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6396 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006397 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006398 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006399/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6400 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6401 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6402 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006403 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006404 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006405 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006406 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6407 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6408 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6409 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6410 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6411 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6412 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6413 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6414 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6415 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6416 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6417 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6418 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6419 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006420 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006421#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006422 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423#endif
6424
6425#if 0
6426 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006427 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428#endif
6429
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006430 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 {NULL, NULL}
6432};
6433
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006434static PyObject *
6435unicode_mod(PyObject *v, PyObject *w)
6436{
6437 if (!PyUnicode_Check(v)) {
6438 Py_INCREF(Py_NotImplemented);
6439 return Py_NotImplemented;
6440 }
6441 return PyUnicode_Format(v, w);
6442}
6443
6444static PyNumberMethods unicode_as_number = {
6445 0, /*nb_add*/
6446 0, /*nb_subtract*/
6447 0, /*nb_multiply*/
6448 0, /*nb_divide*/
6449 unicode_mod, /*nb_remainder*/
6450};
6451
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006453 (lenfunc) unicode_length, /* sq_length */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 (binaryfunc) PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006455 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6456 (ssizeargfunc) unicode_getitem, /* sq_item */
6457 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 0, /* sq_ass_item */
6459 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006460 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461};
6462
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006463#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6464
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006465static PyObject*
6466unicode_subscript(PyUnicodeObject* self, PyObject* item)
6467{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006468 PyNumberMethods *nb = item->ob_type->tp_as_number;
6469 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6470 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006471 if (i == -1 && PyErr_Occurred())
6472 return NULL;
6473 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006474 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006475 return unicode_getitem(self, i);
6476 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006477 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006478 Py_UNICODE* source_buf;
6479 Py_UNICODE* result_buf;
6480 PyObject* result;
6481
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006482 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006483 &start, &stop, &step, &slicelength) < 0) {
6484 return NULL;
6485 }
6486
6487 if (slicelength <= 0) {
6488 return PyUnicode_FromUnicode(NULL, 0);
6489 } else {
6490 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6491 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006492
6493 if (result_buf == NULL)
6494 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006495
6496 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6497 result_buf[i] = source_buf[cur];
6498 }
Tim Petersced69f82003-09-16 20:30:58 +00006499
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006500 result = PyUnicode_FromUnicode(result_buf, slicelength);
6501 PyMem_FREE(result_buf);
6502 return result;
6503 }
6504 } else {
6505 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6506 return NULL;
6507 }
6508}
6509
6510static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006511 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006512 (binaryfunc)unicode_subscript, /* mp_subscript */
6513 (objobjargproc)0, /* mp_ass_subscript */
6514};
6515
Martin v. Löwis18e16552006-02-15 17:27:45 +00006516static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006518 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 const void **ptr)
6520{
6521 if (index != 0) {
6522 PyErr_SetString(PyExc_SystemError,
6523 "accessing non-existent unicode segment");
6524 return -1;
6525 }
6526 *ptr = (void *) self->str;
6527 return PyUnicode_GET_DATA_SIZE(self);
6528}
6529
Martin v. Löwis18e16552006-02-15 17:27:45 +00006530static Py_ssize_t
6531unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 const void **ptr)
6533{
6534 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006535 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 return -1;
6537}
6538
6539static int
6540unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006541 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542{
6543 if (lenp)
6544 *lenp = PyUnicode_GET_DATA_SIZE(self);
6545 return 1;
6546}
6547
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006548static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006550 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 const void **ptr)
6552{
6553 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006554
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 if (index != 0) {
6556 PyErr_SetString(PyExc_SystemError,
6557 "accessing non-existent unicode segment");
6558 return -1;
6559 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006560 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 if (str == NULL)
6562 return -1;
6563 *ptr = (void *) PyString_AS_STRING(str);
6564 return PyString_GET_SIZE(str);
6565}
6566
6567/* Helpers for PyUnicode_Format() */
6568
6569static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006570getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006572 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 if (argidx < arglen) {
6574 (*p_argidx)++;
6575 if (arglen < 0)
6576 return args;
6577 else
6578 return PyTuple_GetItem(args, argidx);
6579 }
6580 PyErr_SetString(PyExc_TypeError,
6581 "not enough arguments for format string");
6582 return NULL;
6583}
6584
/* Bit flags collected into the `flags` argument of the format helpers;
   each occupies its own bit so they can be OR-ed together. */
#define F_LJUST (1<<0)  /* bit 0 */
#define F_SIGN  (1<<1)  /* bit 1 */
#define F_BLANK (1<<2)  /* bit 2 */
#define F_ALT   (1<<3)  /* bit 3: tested by the formatters to emit '#' */
#define F_ZERO  (1<<4)  /* bit 4 */
6590
Martin v. Löwis18e16552006-02-15 17:27:45 +00006591static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006592strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006594 register Py_ssize_t i;
6595 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 for (i = len - 1; i >= 0; i--)
6597 buffer[i] = (Py_UNICODE) charbuffer[i];
6598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 return len;
6600}
6601
Neal Norwitzfc76d632006-01-10 06:03:13 +00006602static int
6603doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6604{
Tim Peters15231542006-02-16 01:08:01 +00006605 Py_ssize_t result;
6606
Neal Norwitzfc76d632006-01-10 06:03:13 +00006607 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006608 result = strtounicode(buffer, (char *)buffer);
6609 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006610}
6611
6612static int
6613longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6614{
Tim Peters15231542006-02-16 01:08:01 +00006615 Py_ssize_t result;
6616
Neal Norwitzfc76d632006-01-10 06:03:13 +00006617 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006618 result = strtounicode(buffer, (char *)buffer);
6619 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006620}
6621
Guido van Rossum078151d2002-08-11 04:24:12 +00006622/* XXX To save some code duplication, formatfloat/long/int could have been
6623 shared with stringobject.c, converting from 8-bit to Unicode after the
6624 formatting is done. */
6625
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626static int
6627formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006628 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 int flags,
6630 int prec,
6631 int type,
6632 PyObject *v)
6633{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006634 /* fmt = '%#.' + `prec` + `type`
6635 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 char fmt[20];
6637 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006638
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 x = PyFloat_AsDouble(v);
6640 if (x == -1.0 && PyErr_Occurred())
6641 return -1;
6642 if (prec < 0)
6643 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6645 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006646 /* Worst case length calc to ensure no buffer overrun:
6647
6648 'g' formats:
6649 fmt = %#.<prec>g
6650 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6651 for any double rep.)
6652 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6653
6654 'f' formats:
6655 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6656 len = 1 + 50 + 1 + prec = 52 + prec
6657
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006658 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006659 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006660
6661 */
6662 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6663 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006664 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006665 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006666 return -1;
6667 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006668 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6669 (flags&F_ALT) ? "#" : "",
6670 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006671 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672}
6673
Tim Peters38fd5b62000-09-21 05:43:11 +00006674static PyObject*
6675formatlong(PyObject *val, int flags, int prec, int type)
6676{
6677 char *buf;
6678 int i, len;
6679 PyObject *str; /* temporary string object. */
6680 PyUnicodeObject *result;
6681
6682 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6683 if (!str)
6684 return NULL;
6685 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006686 if (!result) {
6687 Py_DECREF(str);
6688 return NULL;
6689 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006690 for (i = 0; i < len; i++)
6691 result->str[i] = buf[i];
6692 result->str[len] = 0;
6693 Py_DECREF(str);
6694 return (PyObject*)result;
6695}
6696
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697static int
6698formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006699 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 int flags,
6701 int prec,
6702 int type,
6703 PyObject *v)
6704{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006705 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006706 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6707 * + 1 + 1
6708 * = 24
6709 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006710 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006711 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 long x;
6713
6714 x = PyInt_AsLong(v);
6715 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006716 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006717 if (x < 0 && type == 'u') {
6718 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006719 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006720 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6721 sign = "-";
6722 else
6723 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006725 prec = 1;
6726
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006727 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6728 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006729 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006730 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006731 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006732 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006733 return -1;
6734 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006735
6736 if ((flags & F_ALT) &&
6737 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006738 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006739 * of issues that cause pain:
6740 * - when 0 is being converted, the C standard leaves off
6741 * the '0x' or '0X', which is inconsistent with other
6742 * %#x/%#X conversions and inconsistent with Python's
6743 * hex() function
6744 * - there are platforms that violate the standard and
6745 * convert 0 with the '0x' or '0X'
6746 * (Metrowerks, Compaq Tru64)
6747 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006748 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006749 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006750 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006751 * We can achieve the desired consistency by inserting our
6752 * own '0x' or '0X' prefix, and substituting %x/%X in place
6753 * of %#x/%#X.
6754 *
6755 * Note that this is the same approach as used in
6756 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006757 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006758 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6759 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006760 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006761 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006762 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6763 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006764 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006765 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006766 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006767 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006768 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006769 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770}
6771
6772static int
6773formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006774 size_t buflen,
6775 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006777 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006778 if (PyUnicode_Check(v)) {
6779 if (PyUnicode_GET_SIZE(v) != 1)
6780 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006784 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006785 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006786 goto onError;
6787 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789
6790 else {
6791 /* Integer input truncated to a character */
6792 long x;
6793 x = PyInt_AsLong(v);
6794 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006795 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006796#ifdef Py_UNICODE_WIDE
6797 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006798 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006799 "%c arg not in range(0x110000) "
6800 "(wide Python build)");
6801 return -1;
6802 }
6803#else
6804 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006805 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006806 "%c arg not in range(0x10000) "
6807 "(narrow Python build)");
6808 return -1;
6809 }
6810#endif
6811 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 }
6813 buf[1] = '\0';
6814 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006815
6816 onError:
6817 PyErr_SetString(PyExc_TypeError,
6818 "%c requires int or char");
6819 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820}
6821
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006822/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6823
6824 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6825 chars are formatted. XXX This is a magic number. Each formatting
6826 routine does bounds checking to ensure no overflow, but a better
6827 solution may be to malloc a buffer of appropriate size for each
6828 format. For now, the current solution is sufficient.
6829*/
/* Number of Py_UNICODE slots in the scratch buffer used by the format
   helpers.  The expansion is fully parenthesized so the cast stays
   attached to its operand wherever the macro is used (for example
   directly after `sizeof`). */
#define FORMATBUFLEN ((size_t)120)
6831
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832PyObject *PyUnicode_Format(PyObject *format,
6833 PyObject *args)
6834{
6835 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006836 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 int args_owned = 0;
6838 PyUnicodeObject *result = NULL;
6839 PyObject *dict = NULL;
6840 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006841
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 if (format == NULL || args == NULL) {
6843 PyErr_BadInternalCall();
6844 return NULL;
6845 }
6846 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006847 if (uformat == NULL)
6848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 fmt = PyUnicode_AS_UNICODE(uformat);
6850 fmtcnt = PyUnicode_GET_SIZE(uformat);
6851
6852 reslen = rescnt = fmtcnt + 100;
6853 result = _PyUnicode_New(reslen);
6854 if (result == NULL)
6855 goto onError;
6856 res = PyUnicode_AS_UNICODE(result);
6857
6858 if (PyTuple_Check(args)) {
6859 arglen = PyTuple_Size(args);
6860 argidx = 0;
6861 }
6862 else {
6863 arglen = -1;
6864 argidx = -2;
6865 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006866 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6867 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 dict = args;
6869
6870 while (--fmtcnt >= 0) {
6871 if (*fmt != '%') {
6872 if (--rescnt < 0) {
6873 rescnt = fmtcnt + 100;
6874 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006875 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006876 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6878 --rescnt;
6879 }
6880 *res++ = *fmt++;
6881 }
6882 else {
6883 /* Got a format specifier */
6884 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006885 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 Py_UNICODE c = '\0';
6888 Py_UNICODE fill;
6889 PyObject *v = NULL;
6890 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006891 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006893 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006894 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895
6896 fmt++;
6897 if (*fmt == '(') {
6898 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006899 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 PyObject *key;
6901 int pcount = 1;
6902
6903 if (dict == NULL) {
6904 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006905 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 goto onError;
6907 }
6908 ++fmt;
6909 --fmtcnt;
6910 keystart = fmt;
6911 /* Skip over balanced parentheses */
6912 while (pcount > 0 && --fmtcnt >= 0) {
6913 if (*fmt == ')')
6914 --pcount;
6915 else if (*fmt == '(')
6916 ++pcount;
6917 fmt++;
6918 }
6919 keylen = fmt - keystart - 1;
6920 if (fmtcnt < 0 || pcount > 0) {
6921 PyErr_SetString(PyExc_ValueError,
6922 "incomplete format key");
6923 goto onError;
6924 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006925#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006926 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 then looked up since Python uses strings to hold
6928 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006929 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 key = PyUnicode_EncodeUTF8(keystart,
6931 keylen,
6932 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006933#else
6934 key = PyUnicode_FromUnicode(keystart, keylen);
6935#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 if (key == NULL)
6937 goto onError;
6938 if (args_owned) {
6939 Py_DECREF(args);
6940 args_owned = 0;
6941 }
6942 args = PyObject_GetItem(dict, key);
6943 Py_DECREF(key);
6944 if (args == NULL) {
6945 goto onError;
6946 }
6947 args_owned = 1;
6948 arglen = -1;
6949 argidx = -2;
6950 }
6951 while (--fmtcnt >= 0) {
6952 switch (c = *fmt++) {
6953 case '-': flags |= F_LJUST; continue;
6954 case '+': flags |= F_SIGN; continue;
6955 case ' ': flags |= F_BLANK; continue;
6956 case '#': flags |= F_ALT; continue;
6957 case '0': flags |= F_ZERO; continue;
6958 }
6959 break;
6960 }
6961 if (c == '*') {
6962 v = getnextarg(args, arglen, &argidx);
6963 if (v == NULL)
6964 goto onError;
6965 if (!PyInt_Check(v)) {
6966 PyErr_SetString(PyExc_TypeError,
6967 "* wants int");
6968 goto onError;
6969 }
6970 width = PyInt_AsLong(v);
6971 if (width < 0) {
6972 flags |= F_LJUST;
6973 width = -width;
6974 }
6975 if (--fmtcnt >= 0)
6976 c = *fmt++;
6977 }
6978 else if (c >= '0' && c <= '9') {
6979 width = c - '0';
6980 while (--fmtcnt >= 0) {
6981 c = *fmt++;
6982 if (c < '0' || c > '9')
6983 break;
6984 if ((width*10) / 10 != width) {
6985 PyErr_SetString(PyExc_ValueError,
6986 "width too big");
6987 goto onError;
6988 }
6989 width = width*10 + (c - '0');
6990 }
6991 }
6992 if (c == '.') {
6993 prec = 0;
6994 if (--fmtcnt >= 0)
6995 c = *fmt++;
6996 if (c == '*') {
6997 v = getnextarg(args, arglen, &argidx);
6998 if (v == NULL)
6999 goto onError;
7000 if (!PyInt_Check(v)) {
7001 PyErr_SetString(PyExc_TypeError,
7002 "* wants int");
7003 goto onError;
7004 }
7005 prec = PyInt_AsLong(v);
7006 if (prec < 0)
7007 prec = 0;
7008 if (--fmtcnt >= 0)
7009 c = *fmt++;
7010 }
7011 else if (c >= '0' && c <= '9') {
7012 prec = c - '0';
7013 while (--fmtcnt >= 0) {
7014 c = Py_CHARMASK(*fmt++);
7015 if (c < '0' || c > '9')
7016 break;
7017 if ((prec*10) / 10 != prec) {
7018 PyErr_SetString(PyExc_ValueError,
7019 "prec too big");
7020 goto onError;
7021 }
7022 prec = prec*10 + (c - '0');
7023 }
7024 }
7025 } /* prec */
7026 if (fmtcnt >= 0) {
7027 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 if (--fmtcnt >= 0)
7029 c = *fmt++;
7030 }
7031 }
7032 if (fmtcnt < 0) {
7033 PyErr_SetString(PyExc_ValueError,
7034 "incomplete format");
7035 goto onError;
7036 }
7037 if (c != '%') {
7038 v = getnextarg(args, arglen, &argidx);
7039 if (v == NULL)
7040 goto onError;
7041 }
7042 sign = 0;
7043 fill = ' ';
7044 switch (c) {
7045
7046 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007047 pbuf = formatbuf;
7048 /* presume that buffer length is at least 1 */
7049 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 len = 1;
7051 break;
7052
7053 case 's':
7054 case 'r':
7055 if (PyUnicode_Check(v) && c == 's') {
7056 temp = v;
7057 Py_INCREF(temp);
7058 }
7059 else {
7060 PyObject *unicode;
7061 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007062 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 else
7064 temp = PyObject_Repr(v);
7065 if (temp == NULL)
7066 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007067 if (PyUnicode_Check(temp))
7068 /* nothing to do */;
7069 else if (PyString_Check(temp)) {
7070 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00007071 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00007073 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 "strict");
7075 Py_DECREF(temp);
7076 temp = unicode;
7077 if (temp == NULL)
7078 goto onError;
7079 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007080 else {
7081 Py_DECREF(temp);
7082 PyErr_SetString(PyExc_TypeError,
7083 "%s argument has non-string str()");
7084 goto onError;
7085 }
7086 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007087 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 len = PyUnicode_GET_SIZE(temp);
7089 if (prec >= 0 && len > prec)
7090 len = prec;
7091 break;
7092
7093 case 'i':
7094 case 'd':
7095 case 'u':
7096 case 'o':
7097 case 'x':
7098 case 'X':
7099 if (c == 'i')
7100 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007101 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007102 temp = formatlong(v, flags, prec, c);
7103 if (!temp)
7104 goto onError;
7105 pbuf = PyUnicode_AS_UNICODE(temp);
7106 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007107 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007109 else {
7110 pbuf = formatbuf;
7111 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7112 flags, prec, c, v);
7113 if (len < 0)
7114 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007115 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007116 }
7117 if (flags & F_ZERO)
7118 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 break;
7120
7121 case 'e':
7122 case 'E':
7123 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007124 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 case 'g':
7126 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007127 if (c == 'F')
7128 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007129 pbuf = formatbuf;
7130 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7131 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 if (len < 0)
7133 goto onError;
7134 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007135 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 fill = '0';
7137 break;
7138
7139 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007140 pbuf = formatbuf;
7141 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 if (len < 0)
7143 goto onError;
7144 break;
7145
7146 default:
7147 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007148 "unsupported format character '%c' (0x%x) "
7149 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007150 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007151 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007152 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 goto onError;
7154 }
7155 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007156 if (*pbuf == '-' || *pbuf == '+') {
7157 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 len--;
7159 }
7160 else if (flags & F_SIGN)
7161 sign = '+';
7162 else if (flags & F_BLANK)
7163 sign = ' ';
7164 else
7165 sign = 0;
7166 }
7167 if (width < len)
7168 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007169 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 reslen -= rescnt;
7171 rescnt = width + fmtcnt + 100;
7172 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007173 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007174 Py_XDECREF(temp);
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007175 Py_DECREF(result);
7176 return PyErr_NoMemory();
7177 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007178 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 return NULL;
7180 res = PyUnicode_AS_UNICODE(result)
7181 + reslen - rescnt;
7182 }
7183 if (sign) {
7184 if (fill != ' ')
7185 *res++ = sign;
7186 rescnt--;
7187 if (width > len)
7188 width--;
7189 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007190 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7191 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007192 assert(pbuf[1] == c);
7193 if (fill != ' ') {
7194 *res++ = *pbuf++;
7195 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007196 }
Tim Petersfff53252001-04-12 18:38:48 +00007197 rescnt -= 2;
7198 width -= 2;
7199 if (width < 0)
7200 width = 0;
7201 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 if (width > len && !(flags & F_LJUST)) {
7204 do {
7205 --rescnt;
7206 *res++ = fill;
7207 } while (--width > len);
7208 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007209 if (fill == ' ') {
7210 if (sign)
7211 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007212 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007213 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007214 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007215 *res++ = *pbuf++;
7216 *res++ = *pbuf++;
7217 }
7218 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007219 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 res += len;
7221 rescnt -= len;
7222 while (--width >= len) {
7223 --rescnt;
7224 *res++ = ' ';
7225 }
7226 if (dict && (argidx < arglen) && c != '%') {
7227 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007228 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 goto onError;
7230 }
7231 Py_XDECREF(temp);
7232 } /* '%' */
7233 } /* until end */
7234 if (argidx < arglen && !dict) {
7235 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007236 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 goto onError;
7238 }
7239
7240 if (args_owned) {
7241 Py_DECREF(args);
7242 }
7243 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007244 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007245 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 return (PyObject *)result;
7247
7248 onError:
7249 Py_XDECREF(result);
7250 Py_DECREF(uformat);
7251 if (args_owned) {
7252 Py_DECREF(args);
7253 }
7254 return NULL;
7255}
7256
7257static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007258 (readbufferproc) unicode_buffer_getreadbuf,
7259 (writebufferproc) unicode_buffer_getwritebuf,
7260 (segcountproc) unicode_buffer_getsegcount,
7261 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262};
7263
Jeremy Hylton938ace62002-07-17 16:30:39 +00007264static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007265unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7266
Tim Peters6d6c1a32001-08-02 04:15:00 +00007267static PyObject *
7268unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7269{
7270 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007271 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007272 char *encoding = NULL;
7273 char *errors = NULL;
7274
Guido van Rossume023fe02001-08-30 03:12:59 +00007275 if (type != &PyUnicode_Type)
7276 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007277 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7278 kwlist, &x, &encoding, &errors))
7279 return NULL;
7280 if (x == NULL)
7281 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007282 if (encoding == NULL && errors == NULL)
7283 return PyObject_Unicode(x);
7284 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007285 return PyUnicode_FromEncodedObject(x, encoding, errors);
7286}
7287
Guido van Rossume023fe02001-08-30 03:12:59 +00007288static PyObject *
7289unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7290{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007291 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007292 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007293
7294 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7295 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7296 if (tmp == NULL)
7297 return NULL;
7298 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007299 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007300 if (pnew == NULL) {
7301 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007302 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007303 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007304 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7305 if (pnew->str == NULL) {
7306 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007307 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007308 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007309 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007310 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007311 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7312 pnew->length = n;
7313 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007314 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007315 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007316}
7317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007318PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007319"unicode(string [, encoding[, errors]]) -> object\n\
7320\n\
7321Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007322encoding defaults to the current default string encoding.\n\
7323errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007324
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325PyTypeObject PyUnicode_Type = {
7326 PyObject_HEAD_INIT(&PyType_Type)
7327 0, /* ob_size */
7328 "unicode", /* tp_name */
7329 sizeof(PyUnicodeObject), /* tp_size */
7330 0, /* tp_itemsize */
7331 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007332 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007334 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335 0, /* tp_setattr */
7336 (cmpfunc) unicode_compare, /* tp_compare */
7337 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007338 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007340 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 (hashfunc) unicode_hash, /* tp_hash*/
7342 0, /* tp_call*/
7343 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007344 PyObject_GenericGetAttr, /* tp_getattro */
7345 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007347 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7348 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007349 unicode_doc, /* tp_doc */
7350 0, /* tp_traverse */
7351 0, /* tp_clear */
7352 0, /* tp_richcompare */
7353 0, /* tp_weaklistoffset */
7354 0, /* tp_iter */
7355 0, /* tp_iternext */
7356 unicode_methods, /* tp_methods */
7357 0, /* tp_members */
7358 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007359 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007360 0, /* tp_dict */
7361 0, /* tp_descr_get */
7362 0, /* tp_descr_set */
7363 0, /* tp_dictoffset */
7364 0, /* tp_init */
7365 0, /* tp_alloc */
7366 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007367 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368};
7369
7370/* Initialize the Unicode implementation */
7371
Thomas Wouters78890102000-07-22 19:25:51 +00007372void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007374 int i;
7375
Fred Drakee4315f52000-05-09 19:53:39 +00007376 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007377 unicode_freelist = NULL;
7378 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007380 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007381 for (i = 0; i < 256; i++)
7382 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007383 if (PyType_Ready(&PyUnicode_Type) < 0)
7384 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385}
7386
7387/* Finalize the Unicode implementation */
7388
7389void
Thomas Wouters78890102000-07-22 19:25:51 +00007390_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007392 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007393 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007395 Py_XDECREF(unicode_empty);
7396 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007397
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007398 for (i = 0; i < 256; i++) {
7399 if (unicode_latin1[i]) {
7400 Py_DECREF(unicode_latin1[i]);
7401 unicode_latin1[i] = NULL;
7402 }
7403 }
7404
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007405 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 PyUnicodeObject *v = u;
7407 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007408 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007409 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007410 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007411 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007413 unicode_freelist = NULL;
7414 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007416
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/