blob: 854310b4720b51341dfb1202310185d76dda9acb [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep-alive" optimization.

   Objects placed on the free list keep their character buffer when
   their length is below this limit; longer buffers are released.  This
   saves malloc() calls for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature!  If Unicode objects cause
   core dumps, try turning it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian is assumed unless the build
   defines WORDS_BIGENDIAN. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
100 Another way to look at this is that to say that the actual reference
101 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
/* --- Bloom Filters ----------------------------------------------------- */

/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used, with the low 5
   bits of each Unicode character selecting the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
140
/* Integer type used to hold a bloom mask (only the low 32 bits are used). */
#define BLOOM_MASK unsigned long

/* Bloom mask for line break characters; filled in by the Unicode
   initialization code (not visible in this part of the file). */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shifted constant is an unsigned long so that bit 31 can be
   reached without overflowing a signed int, and `mask` is parenthesized so
   that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: the cheap bloom check runs first and the exact
   Py_UNICODE_ISLINEBREAK() test only runs when the bloom bit is set. */
#define BLOOM_LINEBREAK(ch) \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Membership test for `chr` in `set`: the bloom mask is consulted first and
   the exact scan only runs when the bloom bit is set.  The expansion is
   wrapped in parentheses so the macro behaves as a single operand (for
   example under `!` or next to `||`). */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
229/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000230 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
308 unicode->ob_refcnt = 3;
309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000341 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000355 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
380/* Internal API for use in unicodeobject.c only ! */
381#define _PyUnicode_Resize(unicodevar, length) \
382 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
430 some optimizations which share commonly used objects. */
431 if (u != NULL) {
432
433 /* Optimization for empty strings */
434 if (size == 0 && unicode_empty != NULL) {
435 Py_INCREF(unicode_empty);
436 return (PyObject *)unicode_empty;
437 }
438
Walter Dörwald071b9da2007-05-05 14:21:20 +0000439 /* Single characters are shared when using this constructor */
440 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000441 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
444 if (!unicode)
445 return NULL;
446 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000447 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
453
Walter Dörwald55507312007-05-18 13:12:10 +0000454 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000460 Py_UNICODE *p = unicode->str;
461 while ((*p++ = *u++))
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000462 ;
463 }
464
465 return (PyObject *)unicode;
466}
467
Walter Dörwaldd2034312007-05-18 16:29:38 +0000468PyObject *PyUnicode_FromString(const char *u)
469{
470 size_t size = strlen(u);
471 if (size > PY_SSIZE_T_MAX) {
472 PyErr_SetString(PyExc_OverflowError, "input too long");
473 return NULL;
474 }
475
476 return PyUnicode_FromStringAndSize(u, size);
477}
478
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479#ifdef HAVE_WCHAR_H
480
481PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000482 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000483{
484 PyUnicodeObject *unicode;
485
486 if (w == NULL) {
487 PyErr_BadInternalCall();
488 return NULL;
489 }
490
491 unicode = _PyUnicode_New(size);
492 if (!unicode)
493 return NULL;
494
495 /* Copy the wchar_t data into the new object */
496#ifdef HAVE_USABLE_WCHAR_T
497 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000498#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000499 {
500 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000501 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000502 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000503 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 *u++ = *w++;
505 }
506#endif
507
508 return (PyObject *)unicode;
509}
510
Walter Dörwaldd2034312007-05-18 16:29:38 +0000511#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
512
513PyObject *
514PyUnicode_FromFormatV(const char *format, va_list vargs)
515{
516 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000517 Py_ssize_t callcount = 0;
518 PyObject **callresults = NULL;
519 PyObject **callresult;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000520 Py_ssize_t n = 0;
521 const char* f;
522 Py_UNICODE *s;
523 PyObject *string;
524 /* used by sprintf */
525 char buffer[21];
526 const char *copy;
527
528#ifdef VA_LIST_IS_ARRAY
529 Py_MEMCPY(count, vargs, sizeof(va_list));
530#else
531#ifdef __va_copy
532 __va_copy(count, vargs);
533#else
534 count = vargs;
535#endif
536#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000537 /* step 1: count the number of %S/%R format specifications
538 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
539 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000540 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000541 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000542 ++callcount;
543 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000544 /* step 2: allocate memory for the results of
545 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000546 if (callcount) {
547 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
548 if (!callresults) {
549 PyErr_NoMemory();
550 return NULL;
551 }
552 callresult = callresults;
553 }
554 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000555 for (f = format; *f; f++) {
556 if (*f == '%') {
557 const char* p = f;
558 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
559 ;
560
561 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
562 * they don't affect the amount of space we reserve.
563 */
564 if ((*f == 'l' || *f == 'z') &&
565 (f[1] == 'd' || f[1] == 'u'))
566 ++f;
567
568 switch (*f) {
569 case 'c':
570 (void)va_arg(count, int);
571 /* fall through... */
572 case '%':
573 n++;
574 break;
575 case 'd': case 'u': case 'i': case 'x':
576 (void) va_arg(count, int);
577 /* 20 bytes is enough to hold a 64-bit
578 integer. Decimal takes the most space.
579 This isn't enough for octal. */
580 n += 20;
581 break;
582 case 's':
583 n += strlen(va_arg(count, char*));
584 break;
585 case 'U':
586 {
587 PyObject *obj = va_arg(count, PyObject *);
588 assert(obj && PyUnicode_Check(obj));
589 n += PyUnicode_GET_SIZE(obj);
590 break;
591 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000592 case 'S':
593 {
594 PyObject *obj = va_arg(count, PyObject *);
595 PyObject *str;
596 assert(obj);
597 str = PyObject_Unicode(obj);
598 if (!str)
599 goto fail;
600 n += PyUnicode_GET_SIZE(str);
601 /* Remember the str and switch to the next slot */
602 *callresult++ = str;
603 break;
604 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000605 case 'R':
606 {
607 PyObject *obj = va_arg(count, PyObject *);
608 PyObject *repr;
609 assert(obj);
610 repr = PyObject_Repr(obj);
611 if (!repr)
612 goto fail;
613 n += PyUnicode_GET_SIZE(repr);
614 /* Remember the repr and switch to the next slot */
615 *callresult++ = repr;
616 break;
617 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000618 case 'p':
619 (void) va_arg(count, int);
620 /* maximum 64-bit pointer representation:
621 * 0xffffffffffffffff
622 * so 19 characters is enough.
623 * XXX I count 18 -- what's the extra for?
624 */
625 n += 19;
626 break;
627 default:
628 /* if we stumble upon an unknown
629 formatting code, copy the rest of
630 the format string to the output
631 string. (we cannot just skip the
632 code, since there's no way to know
633 what's in the argument list) */
634 n += strlen(p);
635 goto expand;
636 }
637 } else
638 n++;
639 }
640 expand:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000641 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000642 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000643 we don't have to resize the string.
644 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000645 string = PyUnicode_FromUnicode(NULL, n);
646 if (!string)
647 return NULL;
648
649 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000650 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000651
652 for (f = format; *f; f++) {
653 if (*f == '%') {
654 const char* p = f++;
655 int longflag = 0;
656 int size_tflag = 0;
657 /* parse the width.precision part (we're only
658 interested in the precision value, if any) */
659 n = 0;
660 while (isdigit(Py_CHARMASK(*f)))
661 n = (n*10) + *f++ - '0';
662 if (*f == '.') {
663 f++;
664 n = 0;
665 while (isdigit(Py_CHARMASK(*f)))
666 n = (n*10) + *f++ - '0';
667 }
668 while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
669 f++;
670 /* handle the long flag, but only for %ld and %lu.
671 others can be added when necessary. */
672 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
673 longflag = 1;
674 ++f;
675 }
676 /* handle the size_t flag. */
677 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
678 size_tflag = 1;
679 ++f;
680 }
681
682 switch (*f) {
683 case 'c':
684 *s++ = va_arg(vargs, int);
685 break;
686 case 'd':
687 if (longflag)
688 sprintf(buffer, "%ld", va_arg(vargs, long));
689 else if (size_tflag)
690 sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
691 va_arg(vargs, Py_ssize_t));
692 else
693 sprintf(buffer, "%d", va_arg(vargs, int));
694 appendstring(buffer);
695 break;
696 case 'u':
697 if (longflag)
698 sprintf(buffer, "%lu",
699 va_arg(vargs, unsigned long));
700 else if (size_tflag)
701 sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
702 va_arg(vargs, size_t));
703 else
704 sprintf(buffer, "%u",
705 va_arg(vargs, unsigned int));
706 appendstring(buffer);
707 break;
708 case 'i':
709 sprintf(buffer, "%i", va_arg(vargs, int));
710 appendstring(buffer);
711 break;
712 case 'x':
713 sprintf(buffer, "%x", va_arg(vargs, int));
714 appendstring(buffer);
715 break;
716 case 's':
717 p = va_arg(vargs, char*);
718 appendstring(p);
719 break;
720 case 'U':
721 {
722 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000723 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
724 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
725 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726 break;
727 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000728 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000729 case 'R':
730 {
731 /* unused, since we already have the result */
732 (void) va_arg(vargs, PyObject *);
733 Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(*callresult);
734 Py_ssize_t usize = PyUnicode_GET_SIZE(*callresult);
735 Py_ssize_t upos;
736 for (upos = 0; upos<usize;)
737 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000738 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000739 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000740 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000741 ++callresult;
742 break;
743 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 case 'p':
745 sprintf(buffer, "%p", va_arg(vargs, void*));
746 /* %p is ill-defined: ensure leading 0x. */
747 if (buffer[1] == 'X')
748 buffer[1] = 'x';
749 else if (buffer[1] != 'x') {
750 memmove(buffer+2, buffer, strlen(buffer)+1);
751 buffer[0] = '0';
752 buffer[1] = 'x';
753 }
754 appendstring(buffer);
755 break;
756 case '%':
757 *s++ = '%';
758 break;
759 default:
760 appendstring(p);
761 goto end;
762 }
763 } else
764 *s++ = *f;
765 }
766
767 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000768 if (callresults)
769 PyMem_Free(callresults);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000770 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
771 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000772 fail:
773 if (callresults) {
774 PyObject **callresult2 = callresults;
775 while (callresult2 <= callresult) {
776 Py_DECREF(*callresult2);
777 ++callresult2;
778 }
779 PyMem_Free(callresults);
780 }
781 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000782}
783
784#undef appendstring
785
786PyObject *
787PyUnicode_FromFormat(const char *format, ...)
788{
789 PyObject* ret;
790 va_list vargs;
791
792#ifdef HAVE_STDARG_PROTOTYPES
793 va_start(vargs, format);
794#else
795 va_start(vargs);
796#endif
797 ret = PyUnicode_FromFormatV(format, vargs);
798 va_end(vargs);
799 return ret;
800}
801
Martin v. Löwis18e16552006-02-15 17:27:45 +0000802Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
803 wchar_t *w,
804 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000805{
806 if (unicode == NULL) {
807 PyErr_BadInternalCall();
808 return -1;
809 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000810
811 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000813 size = PyUnicode_GET_SIZE(unicode) + 1;
814
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815#ifdef HAVE_USABLE_WCHAR_T
816 memcpy(w, unicode->str, size * sizeof(wchar_t));
817#else
818 {
819 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000820 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000822 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 *w++ = *u++;
824 }
825#endif
826
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000827 if (size > PyUnicode_GET_SIZE(unicode))
828 return PyUnicode_GET_SIZE(unicode);
829 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000830 return size;
831}
832
833#endif
834
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000835PyObject *PyUnicode_FromOrdinal(int ordinal)
836{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000837 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000838
839#ifdef Py_UNICODE_WIDE
840 if (ordinal < 0 || ordinal > 0x10ffff) {
841 PyErr_SetString(PyExc_ValueError,
842 "unichr() arg not in range(0x110000) "
843 "(wide Python build)");
844 return NULL;
845 }
846#else
847 if (ordinal < 0 || ordinal > 0xffff) {
848 PyErr_SetString(PyExc_ValueError,
849 "unichr() arg not in range(0x10000) "
850 "(narrow Python build)");
851 return NULL;
852 }
853#endif
854
Hye-Shik Chang40574832004-04-06 07:24:51 +0000855 s[0] = (Py_UNICODE)ordinal;
856 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000857}
858
Guido van Rossumd57fd912000-03-10 22:53:23 +0000859PyObject *PyUnicode_FromObject(register PyObject *obj)
860{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000861 /* XXX Perhaps we should make this API an alias of
862 PyObject_Unicode() instead ?! */
863 if (PyUnicode_CheckExact(obj)) {
864 Py_INCREF(obj);
865 return obj;
866 }
867 if (PyUnicode_Check(obj)) {
868 /* For a Unicode subtype that's not a Unicode object,
869 return a true Unicode object with the same data. */
870 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
871 PyUnicode_GET_SIZE(obj));
872 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000873 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
874}
875
876PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
877 const char *encoding,
878 const char *errors)
879{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000880 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000881 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000882 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000883
Guido van Rossumd57fd912000-03-10 22:53:23 +0000884 if (obj == NULL) {
885 PyErr_BadInternalCall();
886 return NULL;
887 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000888
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000889#if 0
890 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000891 that no encodings is given and then redirect to
892 PyObject_Unicode() which then applies the additional logic for
893 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000894
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000895 NOTE: This API should really only be used for object which
896 represent *encoded* Unicode !
897
898 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000899 if (PyUnicode_Check(obj)) {
900 if (encoding) {
901 PyErr_SetString(PyExc_TypeError,
902 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000903 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000904 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000905 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000906 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000907#else
908 if (PyUnicode_Check(obj)) {
909 PyErr_SetString(PyExc_TypeError,
910 "decoding Unicode is not supported");
911 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000912 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000913#endif
914
915 /* Coerce object */
916 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000917 s = PyString_AS_STRING(obj);
918 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000919 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000920 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
921 /* Overwrite the error message with something more useful in
922 case of a TypeError. */
923 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000924 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000925 "coercing to Unicode: need string or buffer, "
926 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000927 obj->ob_type->tp_name);
928 goto onError;
929 }
Tim Petersced69f82003-09-16 20:30:58 +0000930
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000931 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000932 if (len == 0) {
933 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000934 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 }
Tim Petersced69f82003-09-16 20:30:58 +0000936 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000937 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000938
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000939 return v;
940
941 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000942 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000943}
944
945PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000946 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 const char *encoding,
948 const char *errors)
949{
950 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951
952 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000953 encoding = PyUnicode_GetDefaultEncoding();
954
955 /* Shortcuts for common default encodings */
956 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000957 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000958 else if (strcmp(encoding, "latin-1") == 0)
959 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000960#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
961 else if (strcmp(encoding, "mbcs") == 0)
962 return PyUnicode_DecodeMBCS(s, size, errors);
963#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000964 else if (strcmp(encoding, "ascii") == 0)
965 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000966
967 /* Decode via the codec registry */
968 buffer = PyBuffer_FromMemory((void *)s, size);
969 if (buffer == NULL)
970 goto onError;
971 unicode = PyCodec_Decode(buffer, encoding, errors);
972 if (unicode == NULL)
973 goto onError;
974 if (!PyUnicode_Check(unicode)) {
975 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000976 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000977 unicode->ob_type->tp_name);
978 Py_DECREF(unicode);
979 goto onError;
980 }
981 Py_DECREF(buffer);
982 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000983
Guido van Rossumd57fd912000-03-10 22:53:23 +0000984 onError:
985 Py_XDECREF(buffer);
986 return NULL;
987}
988
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000989PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
990 const char *encoding,
991 const char *errors)
992{
993 PyObject *v;
994
995 if (!PyUnicode_Check(unicode)) {
996 PyErr_BadArgument();
997 goto onError;
998 }
999
1000 if (encoding == NULL)
1001 encoding = PyUnicode_GetDefaultEncoding();
1002
1003 /* Decode via the codec registry */
1004 v = PyCodec_Decode(unicode, encoding, errors);
1005 if (v == NULL)
1006 goto onError;
1007 return v;
1008
1009 onError:
1010 return NULL;
1011}
1012
Guido van Rossumd57fd912000-03-10 22:53:23 +00001013PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001014 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 const char *encoding,
1016 const char *errors)
1017{
1018 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001019
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020 unicode = PyUnicode_FromUnicode(s, size);
1021 if (unicode == NULL)
1022 return NULL;
1023 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1024 Py_DECREF(unicode);
1025 return v;
1026}
1027
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001028PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1029 const char *encoding,
1030 const char *errors)
1031{
1032 PyObject *v;
1033
1034 if (!PyUnicode_Check(unicode)) {
1035 PyErr_BadArgument();
1036 goto onError;
1037 }
1038
1039 if (encoding == NULL)
1040 encoding = PyUnicode_GetDefaultEncoding();
1041
1042 /* Encode via the codec registry */
1043 v = PyCodec_Encode(unicode, encoding, errors);
1044 if (v == NULL)
1045 goto onError;
1046 return v;
1047
1048 onError:
1049 return NULL;
1050}
1051
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1053 const char *encoding,
1054 const char *errors)
1055{
1056 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001057
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 if (!PyUnicode_Check(unicode)) {
1059 PyErr_BadArgument();
1060 goto onError;
1061 }
Fred Drakee4315f52000-05-09 19:53:39 +00001062
Tim Petersced69f82003-09-16 20:30:58 +00001063 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001064 encoding = PyUnicode_GetDefaultEncoding();
1065
1066 /* Shortcuts for common default encodings */
1067 if (errors == NULL) {
1068 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001069 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001070 else if (strcmp(encoding, "latin-1") == 0)
1071 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001072#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1073 else if (strcmp(encoding, "mbcs") == 0)
1074 return PyUnicode_AsMBCSString(unicode);
1075#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001076 else if (strcmp(encoding, "ascii") == 0)
1077 return PyUnicode_AsASCIIString(unicode);
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079
1080 /* Encode via the codec registry */
1081 v = PyCodec_Encode(unicode, encoding, errors);
1082 if (v == NULL)
1083 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001084 if (!PyBytes_Check(v)) {
1085 if (PyString_Check(v)) {
1086 /* Old codec, turn it into bytes */
1087 PyObject *b = PyBytes_FromObject(v);
1088 Py_DECREF(v);
1089 return b;
1090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001092 "encoder did not return a bytes object "
1093 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1094 v->ob_type->tp_name,
1095 encoding ? encoding : "NULL",
1096 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 Py_DECREF(v);
1098 goto onError;
1099 }
1100 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001101
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 onError:
1103 return NULL;
1104}
1105
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001106PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1107 const char *errors)
1108{
1109 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001110 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001111 if (v)
1112 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001113 if (errors != NULL)
1114 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1115 if (errors == NULL) {
1116 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1117 PyUnicode_GET_SIZE(unicode),
1118 NULL);
1119 }
1120 else {
1121 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1122 }
1123 if (!b)
1124 return NULL;
1125 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1126 PyBytes_Size(b));
1127 Py_DECREF(b);
1128 if (!errors) {
1129 Py_XINCREF(v);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001130 ((PyUnicodeObject *)unicode)->defenc = v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001131 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001132 return v;
1133}
1134
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1136{
1137 if (!PyUnicode_Check(unicode)) {
1138 PyErr_BadArgument();
1139 goto onError;
1140 }
1141 return PyUnicode_AS_UNICODE(unicode);
1142
1143 onError:
1144 return NULL;
1145}
1146
Martin v. Löwis18e16552006-02-15 17:27:45 +00001147Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148{
1149 if (!PyUnicode_Check(unicode)) {
1150 PyErr_BadArgument();
1151 goto onError;
1152 }
1153 return PyUnicode_GET_SIZE(unicode);
1154
1155 onError:
1156 return -1;
1157}
1158
Thomas Wouters78890102000-07-22 19:25:51 +00001159const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001160{
1161 return unicode_default_encoding;
1162}
1163
1164int PyUnicode_SetDefaultEncoding(const char *encoding)
1165{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001166 if (strcmp(encoding, unicode_default_encoding) != 0) {
1167 PyErr_Format(PyExc_ValueError,
1168 "Can only set default encoding to %s",
1169 unicode_default_encoding);
1170 return -1;
1171 }
Fred Drakee4315f52000-05-09 19:53:39 +00001172 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001173}
1174
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001175/* error handling callback helper:
1176 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001177 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001178 and adjust various state variables.
1179 return 0 on success, -1 on error
1180*/
1181
1182static
1183int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1184 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001185 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1186 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001187{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001188 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001189
1190 PyObject *restuple = NULL;
1191 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001192 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1193 Py_ssize_t requiredsize;
1194 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001195 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001196 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001197 int res = -1;
1198
1199 if (*errorHandler == NULL) {
1200 *errorHandler = PyCodec_LookupError(errors);
1201 if (*errorHandler == NULL)
1202 goto onError;
1203 }
1204
1205 if (*exceptionObject == NULL) {
1206 *exceptionObject = PyUnicodeDecodeError_Create(
1207 encoding, input, insize, *startinpos, *endinpos, reason);
1208 if (*exceptionObject == NULL)
1209 goto onError;
1210 }
1211 else {
1212 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1213 goto onError;
1214 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1215 goto onError;
1216 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1217 goto onError;
1218 }
1219
1220 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1221 if (restuple == NULL)
1222 goto onError;
1223 if (!PyTuple_Check(restuple)) {
1224 PyErr_Format(PyExc_TypeError, &argparse[4]);
1225 goto onError;
1226 }
1227 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1228 goto onError;
1229 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001230 newpos = insize+newpos;
1231 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001232 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001233 goto onError;
1234 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001235
1236 /* need more space? (at least enough for what we
1237 have+the replacement+the rest of the string (starting
1238 at the new input position), so we won't have to check space
1239 when there are no errors in the rest of the string) */
1240 repptr = PyUnicode_AS_UNICODE(repunicode);
1241 repsize = PyUnicode_GET_SIZE(repunicode);
1242 requiredsize = *outpos + repsize + insize-newpos;
1243 if (requiredsize > outsize) {
1244 if (requiredsize<2*outsize)
1245 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001246 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001247 goto onError;
1248 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1249 }
1250 *endinpos = newpos;
1251 *inptr = input + newpos;
1252 Py_UNICODE_COPY(*outptr, repptr, repsize);
1253 *outptr += repsize;
1254 *outpos += repsize;
1255 /* we made it! */
1256 res = 0;
1257
1258 onError:
1259 Py_XDECREF(restuple);
1260 return res;
1261}
1262
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001263/* --- UTF-7 Codec -------------------------------------------------------- */
1264
1265/* see RFC2152 for details */
1266
/* Classification of the 128 ASCII code points for UTF-7, indicating
   whether a character is special, i.e. cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
*/
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1285
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly in UTF-7 and must go into a base64 shift sequence.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is one of the 64 base64 digits.  Spelled out
   as explicit ASCII range checks instead of isalnum(): c may be a
   Py_UNICODE value outside the range isalnum() is defined for, and the
   result must not depend on the current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* UB64(c): 6-bit value of the base64 digit c (c must satisfy B64CHAR). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE: emit base64 digits to `out` for every complete group of
   6 bits held in ch, most significant first, decrementing bits. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE: emit one Py_UNICODE to `out` for every complete group of
   16 bits held in ch.  Jumps to the utf7Error label (and sets errmsg)
   in the expanding function when a 0xDC00-0xDFFF code unit is seen. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001328
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001329PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001330 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001331 const char *errors)
1332{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001333 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001334 Py_ssize_t startinpos;
1335 Py_ssize_t endinpos;
1336 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001337 const char *e;
1338 PyUnicodeObject *unicode;
1339 Py_UNICODE *p;
1340 const char *errmsg = "";
1341 int inShift = 0;
1342 unsigned int bitsleft = 0;
1343 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 int surrogate = 0;
1345 PyObject *errorHandler = NULL;
1346 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001347
1348 unicode = _PyUnicode_New(size);
1349 if (!unicode)
1350 return NULL;
1351 if (size == 0)
1352 return (PyObject *)unicode;
1353
1354 p = unicode->str;
1355 e = s + size;
1356
1357 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001358 Py_UNICODE ch;
1359 restart:
1360 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001361
1362 if (inShift) {
1363 if ((ch == '-') || !B64CHAR(ch)) {
1364 inShift = 0;
1365 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001366
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001367 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1368 if (bitsleft >= 6) {
1369 /* The shift sequence has a partial character in it. If
1370 bitsleft < 6 then we could just classify it as padding
1371 but that is not the case here */
1372
1373 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001374 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001375 }
1376 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001377 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001378 here so indicate the potential of a misencoded character. */
1379
1380 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1381 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1382 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001383 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001384 }
1385
1386 if (ch == '-') {
1387 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001388 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001389 inShift = 1;
1390 }
1391 } else if (SPECIAL(ch,0,0)) {
1392 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001393 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001394 } else {
1395 *p++ = ch;
1396 }
1397 } else {
1398 charsleft = (charsleft << 6) | UB64(ch);
1399 bitsleft += 6;
1400 s++;
1401 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1402 }
1403 }
1404 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001405 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001406 s++;
1407 if (s < e && *s == '-') {
1408 s++;
1409 *p++ = '+';
1410 } else
1411 {
1412 inShift = 1;
1413 bitsleft = 0;
1414 }
1415 }
1416 else if (SPECIAL(ch,0,0)) {
1417 errmsg = "unexpected special character";
1418 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001419 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001420 }
1421 else {
1422 *p++ = ch;
1423 s++;
1424 }
1425 continue;
1426 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001427 outpos = p-PyUnicode_AS_UNICODE(unicode);
1428 endinpos = s-starts;
1429 if (unicode_decode_call_errorhandler(
1430 errors, &errorHandler,
1431 "utf7", errmsg,
1432 starts, size, &startinpos, &endinpos, &exc, &s,
1433 (PyObject **)&unicode, &outpos, &p))
1434 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001435 }
1436
1437 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001438 outpos = p-PyUnicode_AS_UNICODE(unicode);
1439 endinpos = size;
1440 if (unicode_decode_call_errorhandler(
1441 errors, &errorHandler,
1442 "utf7", "unterminated shift sequence",
1443 starts, size, &startinpos, &endinpos, &exc, &s,
1444 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001445 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 if (s < e)
1447 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001448 }
1449
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001450 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001451 goto onError;
1452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001453 Py_XDECREF(errorHandler);
1454 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001455 return (PyObject *)unicode;
1456
1457onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001458 Py_XDECREF(errorHandler);
1459 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001460 Py_DECREF(unicode);
1461 return NULL;
1462}
1463
1464
1465PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001466 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001467 int encodeSetO,
1468 int encodeWhiteSpace,
1469 const char *errors)
1470{
1471 PyObject *v;
1472 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001473 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001474 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001475 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476 unsigned int bitsleft = 0;
1477 unsigned long charsleft = 0;
1478 char * out;
1479 char * start;
1480
1481 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001482 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001483
Walter Dörwald51ab4142007-05-05 14:43:36 +00001484 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001485 if (v == NULL)
1486 return NULL;
1487
Walter Dörwald51ab4142007-05-05 14:43:36 +00001488 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001489 for (;i < size; ++i) {
1490 Py_UNICODE ch = s[i];
1491
1492 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001493 if (ch == '+') {
1494 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001495 *out++ = '-';
1496 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1497 charsleft = ch;
1498 bitsleft = 16;
1499 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001500 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001501 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001502 } else {
1503 *out++ = (char) ch;
1504 }
1505 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001506 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1507 *out++ = B64(charsleft << (6-bitsleft));
1508 charsleft = 0;
1509 bitsleft = 0;
1510 /* Characters not in the BASE64 set implicitly unshift the sequence
1511 so no '-' is required, except if the character is itself a '-' */
1512 if (B64CHAR(ch) || ch == '-') {
1513 *out++ = '-';
1514 }
1515 inShift = 0;
1516 *out++ = (char) ch;
1517 } else {
1518 bitsleft += 16;
1519 charsleft = (charsleft << 16) | ch;
1520 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1521
1522 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001523 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001524 or '-' then the shift sequence will be terminated implicitly and we
1525 don't have to insert a '-'. */
1526
1527 if (bitsleft == 0) {
1528 if (i + 1 < size) {
1529 Py_UNICODE ch2 = s[i+1];
1530
1531 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001532
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001533 } else if (B64CHAR(ch2) || ch2 == '-') {
1534 *out++ = '-';
1535 inShift = 0;
1536 } else {
1537 inShift = 0;
1538 }
1539
1540 }
1541 else {
1542 *out++ = '-';
1543 inShift = 0;
1544 }
1545 }
Tim Petersced69f82003-09-16 20:30:58 +00001546 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001548 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001549 if (bitsleft) {
1550 *out++= B64(charsleft << (6-bitsleft) );
1551 *out++ = '-';
1552 }
1553
Walter Dörwald51ab4142007-05-05 14:43:36 +00001554 if (PyBytes_Resize(v, out - start)) {
1555 Py_DECREF(v);
1556 return NULL;
1557 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558 return v;
1559}
1560
1561#undef SPECIAL
1562#undef B64
1563#undef B64CHAR
1564#undef UB64
1565#undef ENCODE
1566#undef DECODE
1567
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568/* --- UTF-8 Codec -------------------------------------------------------- */
1569
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1591
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001593 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 const char *errors)
1595{
Walter Dörwald69652032004-09-07 20:24:22 +00001596 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1597}
1598
1599PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001600 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001601 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001602 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001603{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001604 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001606 Py_ssize_t startinpos;
1607 Py_ssize_t endinpos;
1608 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609 const char *e;
1610 PyUnicodeObject *unicode;
1611 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001612 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 PyObject *errorHandler = NULL;
1614 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615
1616 /* Note: size will always be longer than the resulting Unicode
1617 character count */
1618 unicode = _PyUnicode_New(size);
1619 if (!unicode)
1620 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001621 if (size == 0) {
1622 if (consumed)
1623 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626
1627 /* Unpack UTF-8 encoded data */
1628 p = unicode->str;
1629 e = s + size;
1630
1631 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001632 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001633
1634 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001635 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636 s++;
1637 continue;
1638 }
1639
1640 n = utf8_code_length[ch];
1641
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001642 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001643 if (consumed)
1644 break;
1645 else {
1646 errmsg = "unexpected end of data";
1647 startinpos = s-starts;
1648 endinpos = size;
1649 goto utf8Error;
1650 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652
1653 switch (n) {
1654
1655 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001656 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001657 startinpos = s-starts;
1658 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001659 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660
1661 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001662 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001663 startinpos = s-starts;
1664 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001665 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666
1667 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001668 if ((s[1] & 0xc0) != 0x80) {
1669 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001670 startinpos = s-starts;
1671 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001672 goto utf8Error;
1673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001675 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001676 startinpos = s-starts;
1677 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001678 errmsg = "illegal encoding";
1679 goto utf8Error;
1680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001682 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683 break;
1684
1685 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001686 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001687 (s[2] & 0xc0) != 0x80) {
1688 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001689 startinpos = s-starts;
1690 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001691 goto utf8Error;
1692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001694 if (ch < 0x0800) {
1695 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001696 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001697
1698 XXX For wide builds (UCS-4) we should probably try
1699 to recombine the surrogates into a single code
1700 unit.
1701 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001702 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001703 startinpos = s-starts;
1704 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001705 goto utf8Error;
1706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001708 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001709 break;
1710
1711 case 4:
1712 if ((s[1] & 0xc0) != 0x80 ||
1713 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001714 (s[3] & 0xc0) != 0x80) {
1715 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001716 startinpos = s-starts;
1717 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001718 goto utf8Error;
1719 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001720 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1721 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1722 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001723 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001724 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001725 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001726 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001727 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001728 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729 startinpos = s-starts;
1730 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001731 goto utf8Error;
1732 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001733#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001734 *p++ = (Py_UNICODE)ch;
1735#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001736 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001737
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001738 /* translate from 10000..10FFFF to 0..FFFF */
1739 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001740
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001741 /* high surrogate = top 10 bits added to D800 */
1742 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001743
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001744 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001745 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001746#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 break;
1748
1749 default:
1750 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001751 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 startinpos = s-starts;
1753 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001754 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755 }
1756 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001757 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001758
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001759 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 outpos = p-PyUnicode_AS_UNICODE(unicode);
1761 if (unicode_decode_call_errorhandler(
1762 errors, &errorHandler,
1763 "utf8", errmsg,
1764 starts, size, &startinpos, &endinpos, &exc, &s,
1765 (PyObject **)&unicode, &outpos, &p))
1766 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767 }
Walter Dörwald69652032004-09-07 20:24:22 +00001768 if (consumed)
1769 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
1771 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001772 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 goto onError;
1774
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 Py_XDECREF(errorHandler);
1776 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777 return (PyObject *)unicode;
1778
1779onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001780 Py_XDECREF(errorHandler);
1781 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 Py_DECREF(unicode);
1783 return NULL;
1784}
1785
Tim Peters602f7402002-04-27 18:03:26 +00001786/* Allocation strategy: if the string is short, convert into a stack buffer
1787 and allocate exactly as much space needed at the end. Else allocate the
1788 maximum possible needed (4 result bytes per Unicode character), and return
1789 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001790*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001791PyObject *
1792PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001793 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001794 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795{
Tim Peters602f7402002-04-27 18:03:26 +00001796#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001797
Martin v. Löwis18e16552006-02-15 17:27:45 +00001798 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001799 PyObject *v; /* result string object */
1800 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001801 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001802 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001803 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001804
Tim Peters602f7402002-04-27 18:03:26 +00001805 assert(s != NULL);
1806 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807
Tim Peters602f7402002-04-27 18:03:26 +00001808 if (size <= MAX_SHORT_UNICHARS) {
1809 /* Write into the stack buffer; nallocated can't overflow.
1810 * At the end, we'll allocate exactly as much heap space as it
1811 * turns out we need.
1812 */
1813 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1814 v = NULL; /* will allocate after we're done */
1815 p = stackbuf;
1816 }
1817 else {
1818 /* Overallocate on the heap, and give the excess back at the end. */
1819 nallocated = size * 4;
1820 if (nallocated / 4 != size) /* overflow! */
1821 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001822 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001823 if (v == NULL)
1824 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001825 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001826 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001827
Tim Peters602f7402002-04-27 18:03:26 +00001828 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001829 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001830
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001831 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001832 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001834
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001836 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001837 *p++ = (char)(0xc0 | (ch >> 6));
1838 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001839 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001840 else {
Tim Peters602f7402002-04-27 18:03:26 +00001841 /* Encode UCS2 Unicode ordinals */
1842 if (ch < 0x10000) {
1843 /* Special case: check for high surrogate */
1844 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1845 Py_UCS4 ch2 = s[i];
1846 /* Check for low surrogate and combine the two to
1847 form a UCS4 value */
1848 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001849 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001850 i++;
1851 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001852 }
Tim Peters602f7402002-04-27 18:03:26 +00001853 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001854 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001855 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001856 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1857 *p++ = (char)(0x80 | (ch & 0x3f));
1858 continue;
1859 }
1860encodeUCS4:
1861 /* Encode UCS4 Unicode ordinals */
1862 *p++ = (char)(0xf0 | (ch >> 18));
1863 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1864 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1865 *p++ = (char)(0x80 | (ch & 0x3f));
1866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001868
Tim Peters602f7402002-04-27 18:03:26 +00001869 if (v == NULL) {
1870 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001871 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001872 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001873 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001874 }
1875 else {
1876 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001877 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001878 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001879 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001880 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001881 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001882
Tim Peters602f7402002-04-27 18:03:26 +00001883#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884}
1885
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1887{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888 if (!PyUnicode_Check(unicode)) {
1889 PyErr_BadArgument();
1890 return NULL;
1891 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001892 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1893 PyUnicode_GET_SIZE(unicode),
1894 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895}
1896
1897/* --- UTF-16 Codec ------------------------------------------------------- */
1898
Tim Peters772747b2001-08-09 22:21:55 +00001899PyObject *
1900PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001901 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001902 const char *errors,
1903 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904{
Walter Dörwald69652032004-09-07 20:24:22 +00001905 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1906}
1907
1908PyObject *
1909PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001910 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001911 const char *errors,
1912 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001913 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001914{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001916 Py_ssize_t startinpos;
1917 Py_ssize_t endinpos;
1918 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 PyUnicodeObject *unicode;
1920 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001921 const unsigned char *q, *e;
1922 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001923 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001924 /* Offsets from q for retrieving byte pairs in the right order. */
1925#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1926 int ihi = 1, ilo = 0;
1927#else
1928 int ihi = 0, ilo = 1;
1929#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001930 PyObject *errorHandler = NULL;
1931 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932
1933 /* Note: size will always be longer than the resulting Unicode
1934 character count */
1935 unicode = _PyUnicode_New(size);
1936 if (!unicode)
1937 return NULL;
1938 if (size == 0)
1939 return (PyObject *)unicode;
1940
1941 /* Unpack UTF-16 encoded data */
1942 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001943 q = (unsigned char *)s;
1944 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945
1946 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001947 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001949 /* Check for BOM marks (U+FEFF) in the input and adjust current
1950 byte order setting accordingly. In native mode, the leading BOM
1951 mark is skipped, in all other modes, it is copied to the output
1952 stream as-is (giving a ZWNBSP character). */
1953 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001954 if (size >= 2) {
1955 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001956#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001957 if (bom == 0xFEFF) {
1958 q += 2;
1959 bo = -1;
1960 }
1961 else if (bom == 0xFFFE) {
1962 q += 2;
1963 bo = 1;
1964 }
Tim Petersced69f82003-09-16 20:30:58 +00001965#else
Walter Dörwald69652032004-09-07 20:24:22 +00001966 if (bom == 0xFEFF) {
1967 q += 2;
1968 bo = 1;
1969 }
1970 else if (bom == 0xFFFE) {
1971 q += 2;
1972 bo = -1;
1973 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001974#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001975 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
Tim Peters772747b2001-08-09 22:21:55 +00001978 if (bo == -1) {
1979 /* force LE */
1980 ihi = 1;
1981 ilo = 0;
1982 }
1983 else if (bo == 1) {
1984 /* force BE */
1985 ihi = 0;
1986 ilo = 1;
1987 }
1988
1989 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001990 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001991 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001992 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001993 if (consumed)
1994 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001995 errmsg = "truncated data";
1996 startinpos = ((const char *)q)-starts;
1997 endinpos = ((const char *)e)-starts;
1998 goto utf16Error;
1999 /* The remaining input chars are ignored if the callback
2000 chooses to skip the input */
2001 }
2002 ch = (q[ihi] << 8) | q[ilo];
2003
Tim Peters772747b2001-08-09 22:21:55 +00002004 q += 2;
2005
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 if (ch < 0xD800 || ch > 0xDFFF) {
2007 *p++ = ch;
2008 continue;
2009 }
2010
2011 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002012 if (q >= e) {
2013 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002014 startinpos = (((const char *)q)-2)-starts;
2015 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002016 goto utf16Error;
2017 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002018 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002019 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2020 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002021 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002022#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002023 *p++ = ch;
2024 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002025#else
2026 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002027#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002028 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002029 }
2030 else {
2031 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 startinpos = (((const char *)q)-4)-starts;
2033 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002034 goto utf16Error;
2035 }
2036
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002038 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002039 startinpos = (((const char *)q)-2)-starts;
2040 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002041 /* Fall through to report the error */
2042
2043 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 outpos = p-PyUnicode_AS_UNICODE(unicode);
2045 if (unicode_decode_call_errorhandler(
2046 errors, &errorHandler,
2047 "utf16", errmsg,
2048 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2049 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002050 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 }
2052
2053 if (byteorder)
2054 *byteorder = bo;
2055
Walter Dörwald69652032004-09-07 20:24:22 +00002056 if (consumed)
2057 *consumed = (const char *)q-starts;
2058
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002060 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 goto onError;
2062
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 Py_XDECREF(errorHandler);
2064 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 return (PyObject *)unicode;
2066
2067onError:
2068 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 Py_XDECREF(errorHandler);
2070 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 return NULL;
2072}
2073
Tim Peters772747b2001-08-09 22:21:55 +00002074PyObject *
2075PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002076 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002077 const char *errors,
2078 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079{
2080 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002081 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002082#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002083 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002084#else
2085 const int pairs = 0;
2086#endif
Tim Peters772747b2001-08-09 22:21:55 +00002087 /* Offsets from p for storing byte pairs in the right order. */
2088#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2089 int ihi = 1, ilo = 0;
2090#else
2091 int ihi = 0, ilo = 1;
2092#endif
2093
2094#define STORECHAR(CH) \
2095 do { \
2096 p[ihi] = ((CH) >> 8) & 0xff; \
2097 p[ilo] = (CH) & 0xff; \
2098 p += 2; \
2099 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002102 for (i = pairs = 0; i < size; i++)
2103 if (s[i] >= 0x10000)
2104 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002105#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002106 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002107 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 if (v == NULL)
2109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110
Walter Dörwald3cc34522007-05-04 10:48:27 +00002111 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002113 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002114 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002115 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002116
2117 if (byteorder == -1) {
2118 /* force LE */
2119 ihi = 1;
2120 ilo = 0;
2121 }
2122 else if (byteorder == 1) {
2123 /* force BE */
2124 ihi = 0;
2125 ilo = 1;
2126 }
2127
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002128 while (size-- > 0) {
2129 Py_UNICODE ch = *s++;
2130 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002131#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002132 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002133 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2134 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002136#endif
Tim Peters772747b2001-08-09 22:21:55 +00002137 STORECHAR(ch);
2138 if (ch2)
2139 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002142#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143}
2144
2145PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2146{
2147 if (!PyUnicode_Check(unicode)) {
2148 PyErr_BadArgument();
2149 return NULL;
2150 }
2151 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2152 PyUnicode_GET_SIZE(unicode),
2153 NULL,
2154 0);
2155}
2156
2157/* --- Unicode Escape Codec ----------------------------------------------- */
2158
/* Name-lookup C API taken from the "ucnhash_CAPI" attribute of the
   unicodedata module.  It stays NULL until the \N{name} escape handler in
   PyUnicode_DecodeUnicodeEscape() imports the module and fills it in. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002160
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002162 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 const char *errors)
2164{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002166 Py_ssize_t startinpos;
2167 Py_ssize_t endinpos;
2168 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002169 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002171 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002173 char* message;
2174 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002175 PyObject *errorHandler = NULL;
2176 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002177
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178 /* Escaped strings will always be longer than the resulting
2179 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002180 length after conversion to the true value.
2181 (but if the error callback returns a long replacement string
2182 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 v = _PyUnicode_New(size);
2184 if (v == NULL)
2185 goto onError;
2186 if (size == 0)
2187 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002189 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002191
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192 while (s < end) {
2193 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002194 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196
2197 /* Non-escape characters are interpreted as Unicode ordinals */
2198 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002199 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 continue;
2201 }
2202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002203 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 /* \ - Escapes */
2205 s++;
2206 switch (*s++) {
2207
2208 /* \x escapes */
2209 case '\n': break;
2210 case '\\': *p++ = '\\'; break;
2211 case '\'': *p++ = '\''; break;
2212 case '\"': *p++ = '\"'; break;
2213 case 'b': *p++ = '\b'; break;
2214 case 'f': *p++ = '\014'; break; /* FF */
2215 case 't': *p++ = '\t'; break;
2216 case 'n': *p++ = '\n'; break;
2217 case 'r': *p++ = '\r'; break;
2218 case 'v': *p++ = '\013'; break; /* VT */
2219 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2220
2221 /* \OOO (octal) escapes */
2222 case '0': case '1': case '2': case '3':
2223 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002224 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002225 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002226 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002228 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002230 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231 break;
2232
Fredrik Lundhccc74732001-02-18 22:13:49 +00002233 /* hex escapes */
2234 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002236 digits = 2;
2237 message = "truncated \\xXX escape";
2238 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239
Fredrik Lundhccc74732001-02-18 22:13:49 +00002240 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002242 digits = 4;
2243 message = "truncated \\uXXXX escape";
2244 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245
Fredrik Lundhccc74732001-02-18 22:13:49 +00002246 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002247 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002248 digits = 8;
2249 message = "truncated \\UXXXXXXXX escape";
2250 hexescape:
2251 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002252 outpos = p-PyUnicode_AS_UNICODE(v);
2253 if (s+digits>end) {
2254 endinpos = size;
2255 if (unicode_decode_call_errorhandler(
2256 errors, &errorHandler,
2257 "unicodeescape", "end of string in escape sequence",
2258 starts, size, &startinpos, &endinpos, &exc, &s,
2259 (PyObject **)&v, &outpos, &p))
2260 goto onError;
2261 goto nextByte;
2262 }
2263 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002264 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002265 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 endinpos = (s+i+1)-starts;
2267 if (unicode_decode_call_errorhandler(
2268 errors, &errorHandler,
2269 "unicodeescape", message,
2270 starts, size, &startinpos, &endinpos, &exc, &s,
2271 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002272 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002274 }
2275 chr = (chr<<4) & ~0xF;
2276 if (c >= '0' && c <= '9')
2277 chr += c - '0';
2278 else if (c >= 'a' && c <= 'f')
2279 chr += 10 + c - 'a';
2280 else
2281 chr += 10 + c - 'A';
2282 }
2283 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002284 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002285 /* _decoding_error will have already written into the
2286 target buffer. */
2287 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002288 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002289 /* when we get here, chr is a 32-bit unicode character */
2290 if (chr <= 0xffff)
2291 /* UCS-2 character */
2292 *p++ = (Py_UNICODE) chr;
2293 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002294 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002295 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002296#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002297 *p++ = chr;
2298#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002299 chr -= 0x10000L;
2300 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002301 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002302#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002303 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002304 endinpos = s-starts;
2305 outpos = p-PyUnicode_AS_UNICODE(v);
2306 if (unicode_decode_call_errorhandler(
2307 errors, &errorHandler,
2308 "unicodeescape", "illegal Unicode character",
2309 starts, size, &startinpos, &endinpos, &exc, &s,
2310 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002311 goto onError;
2312 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002313 break;
2314
2315 /* \N{name} */
2316 case 'N':
2317 message = "malformed \\N character escape";
2318 if (ucnhash_CAPI == NULL) {
2319 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002320 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002321 m = PyImport_ImportModule("unicodedata");
2322 if (m == NULL)
2323 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002324 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002325 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002326 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002327 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002328 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002329 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002330 if (ucnhash_CAPI == NULL)
2331 goto ucnhashError;
2332 }
2333 if (*s == '{') {
2334 const char *start = s+1;
2335 /* look for the closing brace */
2336 while (*s != '}' && s < end)
2337 s++;
2338 if (s > start && s < end && *s == '}') {
2339 /* found a name. look it up in the unicode database */
2340 message = "unknown Unicode character name";
2341 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002342 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002343 goto store;
2344 }
2345 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002346 endinpos = s-starts;
2347 outpos = p-PyUnicode_AS_UNICODE(v);
2348 if (unicode_decode_call_errorhandler(
2349 errors, &errorHandler,
2350 "unicodeescape", message,
2351 starts, size, &startinpos, &endinpos, &exc, &s,
2352 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002353 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002354 break;
2355
2356 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002357 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002358 message = "\\ at end of string";
2359 s--;
2360 endinpos = s-starts;
2361 outpos = p-PyUnicode_AS_UNICODE(v);
2362 if (unicode_decode_call_errorhandler(
2363 errors, &errorHandler,
2364 "unicodeescape", message,
2365 starts, size, &startinpos, &endinpos, &exc, &s,
2366 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002367 goto onError;
2368 }
2369 else {
2370 *p++ = '\\';
2371 *p++ = (unsigned char)s[-1];
2372 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002373 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002375 nextByte:
2376 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002378 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002379 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002380 Py_XDECREF(errorHandler);
2381 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002382 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002383
Fredrik Lundhccc74732001-02-18 22:13:49 +00002384ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002385 PyErr_SetString(
2386 PyExc_UnicodeError,
2387 "\\N escapes not supported (can't load unicodedata module)"
2388 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002389 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002390 Py_XDECREF(errorHandler);
2391 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002392 return NULL;
2393
Fredrik Lundhccc74732001-02-18 22:13:49 +00002394onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002396 Py_XDECREF(errorHandler);
2397 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 return NULL;
2399}
2400
2401/* Return a Unicode-Escape string version of the Unicode object.
2402
2403 If quotes is true, the string is enclosed in u"" or u'' quotes as
2404 appropriate.
2405
2406*/
2407
Thomas Wouters477c8d52006-05-27 19:21:47 +00002408Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2409 Py_ssize_t size,
2410 Py_UNICODE ch)
2411{
2412 /* like wcschr, but doesn't stop at NULL characters */
2413
2414 while (size-- > 0) {
2415 if (*s == ch)
2416 return s;
2417 s++;
2418 }
2419
2420 return NULL;
2421}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002422
/* Lower-case hexadecimal digits, indexed by nibble value (0..15).
   Both the pointer and the characters are const: this is a read-only
   lookup table shared by the escape encoders below. */
static const char *const hexdigits = "0123456789abcdef";
2424
2425PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2426 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427{
2428 PyObject *repr;
2429 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430
Thomas Wouters89f507f2006-12-13 04:49:30 +00002431 /* XXX(nnorwitz): rather than over-allocating, it would be
2432 better to choose a different scheme. Perhaps scan the
2433 first N-chars of the string and allocate based on that size.
2434 */
2435 /* Initial allocation is based on the longest-possible unichr
2436 escape.
2437
2438 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2439 unichr, so in this case it's the longest unichr escape. In
2440 narrow (UTF-16) builds this is five chars per source unichr
2441 since there are two unichrs in the surrogate pair, so in narrow
2442 (UTF-16) builds it's not the longest unichr escape.
2443
2444 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2445 so in the narrow (UTF-16) build case it's the longest unichr
2446 escape.
2447 */
2448
Walter Dörwald79e913e2007-05-12 11:08:06 +00002449 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002450#ifdef Py_UNICODE_WIDE
2451 + 10*size
2452#else
2453 + 6*size
2454#endif
2455 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456 if (repr == NULL)
2457 return NULL;
2458
Walter Dörwald79e913e2007-05-12 11:08:06 +00002459 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 while (size-- > 0) {
2462 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002463
Walter Dörwald79e913e2007-05-12 11:08:06 +00002464 /* Escape backslashes */
2465 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 *p++ = '\\';
2467 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002468 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002469 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002470
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002471#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002472 /* Map 21-bit characters to '\U00xxxxxx' */
2473 else if (ch >= 0x10000) {
2474 *p++ = '\\';
2475 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002476 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2477 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2478 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2479 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2480 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2481 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2482 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2483 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002484 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002485 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002486#else
2487 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002488 else if (ch >= 0xD800 && ch < 0xDC00) {
2489 Py_UNICODE ch2;
2490 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002491
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002492 ch2 = *s++;
2493 size--;
2494 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2495 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2496 *p++ = '\\';
2497 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002498 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2499 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2500 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2501 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2502 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2503 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2504 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2505 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002506 continue;
2507 }
2508 /* Fall through: isolated surrogates are copied as-is */
2509 s--;
2510 size++;
2511 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002512#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002513
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002515 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 *p++ = '\\';
2517 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002518 *p++ = hexdigits[(ch >> 12) & 0x000F];
2519 *p++ = hexdigits[(ch >> 8) & 0x000F];
2520 *p++ = hexdigits[(ch >> 4) & 0x000F];
2521 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002523
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002524 /* Map special whitespace to '\t', \n', '\r' */
2525 else if (ch == '\t') {
2526 *p++ = '\\';
2527 *p++ = 't';
2528 }
2529 else if (ch == '\n') {
2530 *p++ = '\\';
2531 *p++ = 'n';
2532 }
2533 else if (ch == '\r') {
2534 *p++ = '\\';
2535 *p++ = 'r';
2536 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002537
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002538 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002539 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002541 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002542 *p++ = hexdigits[(ch >> 4) & 0x000F];
2543 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002544 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002545
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546 /* Copy everything else as-is */
2547 else
2548 *p++ = (char) ch;
2549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550
2551 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002552 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2553 Py_DECREF(repr);
2554 return NULL;
2555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 return repr;
2557}
2558
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2560{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002561 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 if (!PyUnicode_Check(unicode)) {
2563 PyErr_BadArgument();
2564 return NULL;
2565 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002566 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2567 PyUnicode_GET_SIZE(unicode));
2568
2569 if (!s)
2570 return NULL;
2571 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2572 PyBytes_GET_SIZE(s));
2573 Py_DECREF(s);
2574 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575}
2576
2577/* --- Raw Unicode Escape Codec ------------------------------------------- */
2578
2579PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002580 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 const char *errors)
2582{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002584 Py_ssize_t startinpos;
2585 Py_ssize_t endinpos;
2586 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 const char *end;
2590 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002591 PyObject *errorHandler = NULL;
2592 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002593
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 /* Escaped strings will always be longer than the resulting
2595 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002596 length after conversion to the true value. (But decoding error
2597 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 v = _PyUnicode_New(size);
2599 if (v == NULL)
2600 goto onError;
2601 if (size == 0)
2602 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002603 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 end = s + size;
2605 while (s < end) {
2606 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002607 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002609 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 /* Non-escape characters are interpreted as Unicode ordinals */
2612 if (*s != '\\') {
2613 *p++ = (unsigned char)*s++;
2614 continue;
2615 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002616 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617
2618 /* \u-escapes are only interpreted iff the number of leading
2619 backslashes if odd */
2620 bs = s;
2621 for (;s < end;) {
2622 if (*s != '\\')
2623 break;
2624 *p++ = (unsigned char)*s++;
2625 }
2626 if (((s - bs) & 1) == 0 ||
2627 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002628 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 continue;
2630 }
2631 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002632 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 s++;
2634
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002635 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002636 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002637 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002638 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 endinpos = s-starts;
2641 if (unicode_decode_call_errorhandler(
2642 errors, &errorHandler,
2643 "rawunicodeescape", "truncated \\uXXXX",
2644 starts, size, &startinpos, &endinpos, &exc, &s,
2645 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002647 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648 }
2649 x = (x<<4) & ~0xF;
2650 if (c >= '0' && c <= '9')
2651 x += c - '0';
2652 else if (c >= 'a' && c <= 'f')
2653 x += 10 + c - 'a';
2654 else
2655 x += 10 + c - 'A';
2656 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002657#ifndef Py_UNICODE_WIDE
2658 if (x > 0x10000) {
2659 if (unicode_decode_call_errorhandler(
2660 errors, &errorHandler,
2661 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2662 starts, size, &startinpos, &endinpos, &exc, &s,
2663 (PyObject **)&v, &outpos, &p))
2664 goto onError;
2665 }
2666#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002667 *p++ = x;
2668 nextByte:
2669 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002671 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002672 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002673 Py_XDECREF(errorHandler);
2674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002676
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 onError:
2678 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 Py_XDECREF(errorHandler);
2680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 return NULL;
2682}
2683
2684PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002685 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686{
2687 PyObject *repr;
2688 char *p;
2689 char *q;
2690
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002691#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002692 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002693#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002694 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002695#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 if (repr == NULL)
2697 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002698 if (size == 0)
2699 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700
Walter Dörwald711005d2007-05-12 12:03:26 +00002701 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 while (size-- > 0) {
2703 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002704#ifdef Py_UNICODE_WIDE
2705 /* Map 32-bit characters to '\Uxxxxxxxx' */
2706 if (ch >= 0x10000) {
2707 *p++ = '\\';
2708 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002709 *p++ = hexdigits[(ch >> 28) & 0xf];
2710 *p++ = hexdigits[(ch >> 24) & 0xf];
2711 *p++ = hexdigits[(ch >> 20) & 0xf];
2712 *p++ = hexdigits[(ch >> 16) & 0xf];
2713 *p++ = hexdigits[(ch >> 12) & 0xf];
2714 *p++ = hexdigits[(ch >> 8) & 0xf];
2715 *p++ = hexdigits[(ch >> 4) & 0xf];
2716 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002717 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002718 else
2719#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 /* Map 16-bit characters to '\uxxxx' */
2721 if (ch >= 256) {
2722 *p++ = '\\';
2723 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002724 *p++ = hexdigits[(ch >> 12) & 0xf];
2725 *p++ = hexdigits[(ch >> 8) & 0xf];
2726 *p++ = hexdigits[(ch >> 4) & 0xf];
2727 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 }
2729 /* Copy everything else as-is */
2730 else
2731 *p++ = (char) ch;
2732 }
2733 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002734 if (PyBytes_Resize(repr, p - q)) {
2735 Py_DECREF(repr);
2736 return NULL;
2737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 return repr;
2739}
2740
2741PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2742{
Walter Dörwald711005d2007-05-12 12:03:26 +00002743 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002745 PyErr_BadArgument();
2746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002748 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2749 PyUnicode_GET_SIZE(unicode));
2750
2751 if (!s)
2752 return NULL;
2753 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2754 PyBytes_GET_SIZE(s));
2755 Py_DECREF(s);
2756 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757}
2758
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002759/* --- Unicode Internal Codec ------------------------------------------- */
2760
2761PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002762 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002763 const char *errors)
2764{
2765 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002766 Py_ssize_t startinpos;
2767 Py_ssize_t endinpos;
2768 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002769 PyUnicodeObject *v;
2770 Py_UNICODE *p;
2771 const char *end;
2772 const char *reason;
2773 PyObject *errorHandler = NULL;
2774 PyObject *exc = NULL;
2775
Neal Norwitzd43069c2006-01-08 01:12:10 +00002776#ifdef Py_UNICODE_WIDE
2777 Py_UNICODE unimax = PyUnicode_GetMax();
2778#endif
2779
Thomas Wouters89f507f2006-12-13 04:49:30 +00002780 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002781 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2782 if (v == NULL)
2783 goto onError;
2784 if (PyUnicode_GetSize((PyObject *)v) == 0)
2785 return (PyObject *)v;
2786 p = PyUnicode_AS_UNICODE(v);
2787 end = s + size;
2788
2789 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002790 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002791 /* We have to sanity check the raw data, otherwise doom looms for
2792 some malformed UCS-4 data. */
2793 if (
2794 #ifdef Py_UNICODE_WIDE
2795 *p > unimax || *p < 0 ||
2796 #endif
2797 end-s < Py_UNICODE_SIZE
2798 )
2799 {
2800 startinpos = s - starts;
2801 if (end-s < Py_UNICODE_SIZE) {
2802 endinpos = end-starts;
2803 reason = "truncated input";
2804 }
2805 else {
2806 endinpos = s - starts + Py_UNICODE_SIZE;
2807 reason = "illegal code point (> 0x10FFFF)";
2808 }
2809 outpos = p - PyUnicode_AS_UNICODE(v);
2810 if (unicode_decode_call_errorhandler(
2811 errors, &errorHandler,
2812 "unicode_internal", reason,
2813 starts, size, &startinpos, &endinpos, &exc, &s,
2814 (PyObject **)&v, &outpos, &p)) {
2815 goto onError;
2816 }
2817 }
2818 else {
2819 p++;
2820 s += Py_UNICODE_SIZE;
2821 }
2822 }
2823
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002824 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002825 goto onError;
2826 Py_XDECREF(errorHandler);
2827 Py_XDECREF(exc);
2828 return (PyObject *)v;
2829
2830 onError:
2831 Py_XDECREF(v);
2832 Py_XDECREF(errorHandler);
2833 Py_XDECREF(exc);
2834 return NULL;
2835}
2836
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837/* --- Latin-1 Codec ------------------------------------------------------ */
2838
2839PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002840 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 const char *errors)
2842{
2843 PyUnicodeObject *v;
2844 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002845
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002847 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002848 Py_UNICODE r = *(unsigned char*)s;
2849 return PyUnicode_FromUnicode(&r, 1);
2850 }
2851
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 v = _PyUnicode_New(size);
2853 if (v == NULL)
2854 goto onError;
2855 if (size == 0)
2856 return (PyObject *)v;
2857 p = PyUnicode_AS_UNICODE(v);
2858 while (size-- > 0)
2859 *p++ = (unsigned char)*s++;
2860 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002861
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 onError:
2863 Py_XDECREF(v);
2864 return NULL;
2865}
2866
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867/* create or adjust a UnicodeEncodeError */
2868static void make_encode_exception(PyObject **exceptionObject,
2869 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002870 const Py_UNICODE *unicode, Py_ssize_t size,
2871 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002873{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002874 if (*exceptionObject == NULL) {
2875 *exceptionObject = PyUnicodeEncodeError_Create(
2876 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 }
2878 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2880 goto onError;
2881 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2882 goto onError;
2883 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2884 goto onError;
2885 return;
2886 onError:
2887 Py_DECREF(*exceptionObject);
2888 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 }
2890}
2891
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002892/* raises a UnicodeEncodeError */
2893static void raise_encode_exception(PyObject **exceptionObject,
2894 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002895 const Py_UNICODE *unicode, Py_ssize_t size,
2896 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002897 const char *reason)
2898{
2899 make_encode_exception(exceptionObject,
2900 encoding, unicode, size, startpos, endpos, reason);
2901 if (*exceptionObject != NULL)
2902 PyCodec_StrictErrors(*exceptionObject);
2903}
2904
2905/* error handling callback helper:
2906 build arguments, call the callback and check the arguments,
2907 put the result into newpos and return the replacement string, which
2908 has to be freed by the caller */
2909static PyObject *unicode_encode_call_errorhandler(const char *errors,
2910 PyObject **errorHandler,
2911 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002912 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2913 Py_ssize_t startpos, Py_ssize_t endpos,
2914 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002915{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002916 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002917
2918 PyObject *restuple;
2919 PyObject *resunicode;
2920
2921 if (*errorHandler == NULL) {
2922 *errorHandler = PyCodec_LookupError(errors);
2923 if (*errorHandler == NULL)
2924 return NULL;
2925 }
2926
2927 make_encode_exception(exceptionObject,
2928 encoding, unicode, size, startpos, endpos, reason);
2929 if (*exceptionObject == NULL)
2930 return NULL;
2931
2932 restuple = PyObject_CallFunctionObjArgs(
2933 *errorHandler, *exceptionObject, NULL);
2934 if (restuple == NULL)
2935 return NULL;
2936 if (!PyTuple_Check(restuple)) {
2937 PyErr_Format(PyExc_TypeError, &argparse[4]);
2938 Py_DECREF(restuple);
2939 return NULL;
2940 }
2941 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2942 &resunicode, newpos)) {
2943 Py_DECREF(restuple);
2944 return NULL;
2945 }
2946 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002947 *newpos = size+*newpos;
2948 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002949 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002950 Py_DECREF(restuple);
2951 return NULL;
2952 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002953 Py_INCREF(resunicode);
2954 Py_DECREF(restuple);
2955 return resunicode;
2956}
2957
2958static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002959 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002960 const char *errors,
2961 int limit)
2962{
2963 /* output object */
2964 PyObject *res;
2965 /* pointers to the beginning and end+1 of input */
2966 const Py_UNICODE *startp = p;
2967 const Py_UNICODE *endp = p + size;
2968 /* pointer to the beginning of the unencodable characters */
2969 /* const Py_UNICODE *badp = NULL; */
2970 /* pointer into the output */
2971 char *str;
2972 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002973 Py_ssize_t respos = 0;
2974 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002975 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2976 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 PyObject *errorHandler = NULL;
2978 PyObject *exc = NULL;
2979 /* the following variable is used for caching string comparisons
2980 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2981 int known_errorHandler = -1;
2982
2983 /* allocate enough for a simple encoding without
2984 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002985 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 if (res == NULL)
2987 goto onError;
2988 if (size == 0)
2989 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00002990 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002991 ressize = size;
2992
2993 while (p<endp) {
2994 Py_UNICODE c = *p;
2995
2996 /* can we encode this? */
2997 if (c<limit) {
2998 /* no overflow check, because we know that the space is enough */
2999 *str++ = (char)c;
3000 ++p;
3001 }
3002 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003003 Py_ssize_t unicodepos = p-startp;
3004 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003006 Py_ssize_t repsize;
3007 Py_ssize_t newpos;
3008 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003009 Py_UNICODE *uni2;
3010 /* startpos for collecting unencodable chars */
3011 const Py_UNICODE *collstart = p;
3012 const Py_UNICODE *collend = p;
3013 /* find all unecodable characters */
3014 while ((collend < endp) && ((*collend)>=limit))
3015 ++collend;
3016 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3017 if (known_errorHandler==-1) {
3018 if ((errors==NULL) || (!strcmp(errors, "strict")))
3019 known_errorHandler = 1;
3020 else if (!strcmp(errors, "replace"))
3021 known_errorHandler = 2;
3022 else if (!strcmp(errors, "ignore"))
3023 known_errorHandler = 3;
3024 else if (!strcmp(errors, "xmlcharrefreplace"))
3025 known_errorHandler = 4;
3026 else
3027 known_errorHandler = 0;
3028 }
3029 switch (known_errorHandler) {
3030 case 1: /* strict */
3031 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3032 goto onError;
3033 case 2: /* replace */
3034 while (collstart++<collend)
3035 *str++ = '?'; /* fall through */
3036 case 3: /* ignore */
3037 p = collend;
3038 break;
3039 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003040 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 /* determine replacement size (temporarily (mis)uses p) */
3042 for (p = collstart, repsize = 0; p < collend; ++p) {
3043 if (*p<10)
3044 repsize += 2+1+1;
3045 else if (*p<100)
3046 repsize += 2+2+1;
3047 else if (*p<1000)
3048 repsize += 2+3+1;
3049 else if (*p<10000)
3050 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003051#ifndef Py_UNICODE_WIDE
3052 else
3053 repsize += 2+5+1;
3054#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 else if (*p<100000)
3056 repsize += 2+5+1;
3057 else if (*p<1000000)
3058 repsize += 2+6+1;
3059 else
3060 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003061#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 }
3063 requiredsize = respos+repsize+(endp-collend);
3064 if (requiredsize > ressize) {
3065 if (requiredsize<2*ressize)
3066 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003067 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003069 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 ressize = requiredsize;
3071 }
3072 /* generate replacement (temporarily (mis)uses p) */
3073 for (p = collstart; p < collend; ++p) {
3074 str += sprintf(str, "&#%d;", (int)*p);
3075 }
3076 p = collend;
3077 break;
3078 default:
3079 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3080 encoding, reason, startp, size, &exc,
3081 collstart-startp, collend-startp, &newpos);
3082 if (repunicode == NULL)
3083 goto onError;
3084 /* need more space? (at least enough for what we
3085 have+the replacement+the rest of the string, so
3086 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003087 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 repsize = PyUnicode_GET_SIZE(repunicode);
3089 requiredsize = respos+repsize+(endp-collend);
3090 if (requiredsize > ressize) {
3091 if (requiredsize<2*ressize)
3092 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003093 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 Py_DECREF(repunicode);
3095 goto onError;
3096 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003097 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 ressize = requiredsize;
3099 }
3100 /* check if there is anything unencodable in the replacement
3101 and copy it to the output */
3102 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3103 c = *uni2;
3104 if (c >= limit) {
3105 raise_encode_exception(&exc, encoding, startp, size,
3106 unicodepos, unicodepos+1, reason);
3107 Py_DECREF(repunicode);
3108 goto onError;
3109 }
3110 *str = (char)c;
3111 }
3112 p = startp + newpos;
3113 Py_DECREF(repunicode);
3114 }
3115 }
3116 }
3117 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003118 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003119 if (respos<ressize)
3120 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003121 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003122 Py_XDECREF(errorHandler);
3123 Py_XDECREF(exc);
3124 return res;
3125
3126 onError:
3127 Py_XDECREF(res);
3128 Py_XDECREF(errorHandler);
3129 Py_XDECREF(exc);
3130 return NULL;
3131}
3132
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003134 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 const char *errors)
3136{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003137 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138}
3139
3140PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3141{
3142 if (!PyUnicode_Check(unicode)) {
3143 PyErr_BadArgument();
3144 return NULL;
3145 }
3146 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3147 PyUnicode_GET_SIZE(unicode),
3148 NULL);
3149}
3150
3151/* --- 7-bit ASCII Codec -------------------------------------------------- */
3152
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003154 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 const char *errors)
3156{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003157 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 PyUnicodeObject *v;
3159 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003160 Py_ssize_t startinpos;
3161 Py_ssize_t endinpos;
3162 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003163 const char *e;
3164 PyObject *errorHandler = NULL;
3165 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003166
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003168 if (size == 1 && *(unsigned char*)s < 128) {
3169 Py_UNICODE r = *(unsigned char*)s;
3170 return PyUnicode_FromUnicode(&r, 1);
3171 }
Tim Petersced69f82003-09-16 20:30:58 +00003172
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 v = _PyUnicode_New(size);
3174 if (v == NULL)
3175 goto onError;
3176 if (size == 0)
3177 return (PyObject *)v;
3178 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179 e = s + size;
3180 while (s < e) {
3181 register unsigned char c = (unsigned char)*s;
3182 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003184 ++s;
3185 }
3186 else {
3187 startinpos = s-starts;
3188 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003189 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190 if (unicode_decode_call_errorhandler(
3191 errors, &errorHandler,
3192 "ascii", "ordinal not in range(128)",
3193 starts, size, &startinpos, &endinpos, &exc, &s,
3194 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003198 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003199 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003200 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 Py_XDECREF(errorHandler);
3202 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003204
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 onError:
3206 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003207 Py_XDECREF(errorHandler);
3208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 return NULL;
3210}
3211
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003213 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 const char *errors)
3215{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003216 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217}
3218
3219PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3220{
3221 if (!PyUnicode_Check(unicode)) {
3222 PyErr_BadArgument();
3223 return NULL;
3224 }
3225 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3226 PyUnicode_GET_SIZE(unicode),
3227 NULL);
3228}
3229
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003230#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003231
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003232/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003233
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003234#if SIZEOF_INT < SIZEOF_SSIZE_T
3235#define NEED_RETRY
3236#endif
3237
3238/* XXX This code is limited to "true" double-byte encodings, as
3239 a) it assumes an incomplete character consists of a single byte, and
3240 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3241 encodings, see IsDBCSLeadByteEx documentation. */
3242
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when the
   character before it (found with CharPrev) does not already claim it
   as a trail byte. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        /* CharPrev did not move: nothing precedes this byte */
        return 1;
    if (!IsDBCSLeadByte(*before))
        /* previous character is single-byte */
        return 1;
    /* previous character starts with a lead byte: accept only when it
       is a complete two-byte character ending right before 'here' */
    return (here - before == 2);
}
3253
3254/*
3255 * Decode MBCS string into unicode object. If 'final' is set, converts
3256 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3257 */
3258static int decode_mbcs(PyUnicodeObject **v,
3259 const char *s, /* MBCS string */
3260 int size, /* sizeof MBCS string */
3261 int final)
3262{
3263 Py_UNICODE *p;
3264 Py_ssize_t n = 0;
3265 int usize = 0;
3266
3267 assert(size >= 0);
3268
3269 /* Skip trailing lead-byte unless 'final' is set */
3270 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3271 --size;
3272
3273 /* First get the size of the result */
3274 if (size > 0) {
3275 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3276 if (usize == 0) {
3277 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3278 return -1;
3279 }
3280 }
3281
3282 if (*v == NULL) {
3283 /* Create unicode object */
3284 *v = _PyUnicode_New(usize);
3285 if (*v == NULL)
3286 return -1;
3287 }
3288 else {
3289 /* Extend unicode object */
3290 n = PyUnicode_GET_SIZE(*v);
3291 if (_PyUnicode_Resize(v, n + usize) < 0)
3292 return -1;
3293 }
3294
3295 /* Do the conversion */
3296 if (size > 0) {
3297 p = PyUnicode_AS_UNICODE(*v) + n;
3298 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3299 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3300 return -1;
3301 }
3302 }
3303
3304 return size;
3305}
3306
3307PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3308 Py_ssize_t size,
3309 const char *errors,
3310 Py_ssize_t *consumed)
3311{
3312 PyUnicodeObject *v = NULL;
3313 int done;
3314
3315 if (consumed)
3316 *consumed = 0;
3317
3318#ifdef NEED_RETRY
3319 retry:
3320 if (size > INT_MAX)
3321 done = decode_mbcs(&v, s, INT_MAX, 0);
3322 else
3323#endif
3324 done = decode_mbcs(&v, s, (int)size, !consumed);
3325
3326 if (done < 0) {
3327 Py_XDECREF(v);
3328 return NULL;
3329 }
3330
3331 if (consumed)
3332 *consumed += done;
3333
3334#ifdef NEED_RETRY
3335 if (size > INT_MAX) {
3336 s += done;
3337 size -= done;
3338 goto retry;
3339 }
3340#endif
3341
3342 return (PyObject *)v;
3343}
3344
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003345PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003346 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003347 const char *errors)
3348{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003349 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3350}
3351
3352/*
3353 * Convert unicode into string object (MBCS).
3354 * Returns 0 if succeed, -1 otherwise.
3355 */
3356static int encode_mbcs(PyObject **repr,
3357 const Py_UNICODE *p, /* unicode */
3358 int size) /* size of unicode */
3359{
3360 int mbcssize = 0;
3361 Py_ssize_t n = 0;
3362
3363 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003364
3365 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003366 if (size > 0) {
3367 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3368 if (mbcssize == 0) {
3369 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3370 return -1;
3371 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003372 }
3373
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003374 if (*repr == NULL) {
3375 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003376 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003377 if (*repr == NULL)
3378 return -1;
3379 }
3380 else {
3381 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003382 n = PyBytes_Size(*repr);
3383 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003384 return -1;
3385 }
3386
3387 /* Do the conversion */
3388 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003389 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003390 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3391 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3392 return -1;
3393 }
3394 }
3395
3396 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003397}
3398
3399PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003400 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003401 const char *errors)
3402{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003403 PyObject *repr = NULL;
3404 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003405
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003406#ifdef NEED_RETRY
3407 retry:
3408 if (size > INT_MAX)
3409 ret = encode_mbcs(&repr, p, INT_MAX);
3410 else
3411#endif
3412 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003413
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003414 if (ret < 0) {
3415 Py_XDECREF(repr);
3416 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003417 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003418
3419#ifdef NEED_RETRY
3420 if (size > INT_MAX) {
3421 p += INT_MAX;
3422 size -= INT_MAX;
3423 goto retry;
3424 }
3425#endif
3426
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003427 return repr;
3428}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003429
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003430PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3431{
3432 if (!PyUnicode_Check(unicode)) {
3433 PyErr_BadArgument();
3434 return NULL;
3435 }
3436 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3437 PyUnicode_GET_SIZE(unicode),
3438 NULL);
3439}
3440
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003441#undef NEED_RETRY
3442
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003443#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003444
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445/* --- Character Mapping Codec -------------------------------------------- */
3446
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003448 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 PyObject *mapping,
3450 const char *errors)
3451{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003453 Py_ssize_t startinpos;
3454 Py_ssize_t endinpos;
3455 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003456 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 PyUnicodeObject *v;
3458 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003459 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 PyObject *errorHandler = NULL;
3461 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003462 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003463 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003464
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 /* Default to Latin-1 */
3466 if (mapping == NULL)
3467 return PyUnicode_DecodeLatin1(s, size, errors);
3468
3469 v = _PyUnicode_New(size);
3470 if (v == NULL)
3471 goto onError;
3472 if (size == 0)
3473 return (PyObject *)v;
3474 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003476 if (PyUnicode_CheckExact(mapping)) {
3477 mapstring = PyUnicode_AS_UNICODE(mapping);
3478 maplen = PyUnicode_GET_SIZE(mapping);
3479 while (s < e) {
3480 unsigned char ch = *s;
3481 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003483 if (ch < maplen)
3484 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003486 if (x == 0xfffe) {
3487 /* undefined mapping */
3488 outpos = p-PyUnicode_AS_UNICODE(v);
3489 startinpos = s-starts;
3490 endinpos = startinpos+1;
3491 if (unicode_decode_call_errorhandler(
3492 errors, &errorHandler,
3493 "charmap", "character maps to <undefined>",
3494 starts, size, &startinpos, &endinpos, &exc, &s,
3495 (PyObject **)&v, &outpos, &p)) {
3496 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003497 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003498 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003499 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003500 *p++ = x;
3501 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003503 }
3504 else {
3505 while (s < e) {
3506 unsigned char ch = *s;
3507 PyObject *w, *x;
3508
3509 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3510 w = PyInt_FromLong((long)ch);
3511 if (w == NULL)
3512 goto onError;
3513 x = PyObject_GetItem(mapping, w);
3514 Py_DECREF(w);
3515 if (x == NULL) {
3516 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3517 /* No mapping found means: mapping is undefined. */
3518 PyErr_Clear();
3519 x = Py_None;
3520 Py_INCREF(x);
3521 } else
3522 goto onError;
3523 }
3524
3525 /* Apply mapping */
3526 if (PyInt_Check(x)) {
3527 long value = PyInt_AS_LONG(x);
3528 if (value < 0 || value > 65535) {
3529 PyErr_SetString(PyExc_TypeError,
3530 "character mapping must be in range(65536)");
3531 Py_DECREF(x);
3532 goto onError;
3533 }
3534 *p++ = (Py_UNICODE)value;
3535 }
3536 else if (x == Py_None) {
3537 /* undefined mapping */
3538 outpos = p-PyUnicode_AS_UNICODE(v);
3539 startinpos = s-starts;
3540 endinpos = startinpos+1;
3541 if (unicode_decode_call_errorhandler(
3542 errors, &errorHandler,
3543 "charmap", "character maps to <undefined>",
3544 starts, size, &startinpos, &endinpos, &exc, &s,
3545 (PyObject **)&v, &outpos, &p)) {
3546 Py_DECREF(x);
3547 goto onError;
3548 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003549 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003550 continue;
3551 }
3552 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003553 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003554
3555 if (targetsize == 1)
3556 /* 1-1 mapping */
3557 *p++ = *PyUnicode_AS_UNICODE(x);
3558
3559 else if (targetsize > 1) {
3560 /* 1-n mapping */
3561 if (targetsize > extrachars) {
3562 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003563 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3564 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003565 (targetsize << 2);
3566 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003567 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003568 if (_PyUnicode_Resize(&v,
3569 PyUnicode_GET_SIZE(v) + needed) < 0) {
3570 Py_DECREF(x);
3571 goto onError;
3572 }
3573 p = PyUnicode_AS_UNICODE(v) + oldpos;
3574 }
3575 Py_UNICODE_COPY(p,
3576 PyUnicode_AS_UNICODE(x),
3577 targetsize);
3578 p += targetsize;
3579 extrachars -= targetsize;
3580 }
3581 /* 1-0 mapping: skip the character */
3582 }
3583 else {
3584 /* wrong return value */
3585 PyErr_SetString(PyExc_TypeError,
3586 "character mapping must return integer, None or unicode");
3587 Py_DECREF(x);
3588 goto onError;
3589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003591 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 }
3594 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003595 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 Py_XDECREF(errorHandler);
3598 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003600
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 Py_XDECREF(errorHandler);
3603 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 Py_XDECREF(v);
3605 return NULL;
3606}
3607
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003608/* Charmap encoding: the lookup table */
3609
3610struct encoding_map{
3611 PyObject_HEAD
3612 unsigned char level1[32];
3613 int count2, count3;
3614 unsigned char level23[1];
3615};
3616
3617static PyObject*
3618encoding_map_size(PyObject *obj, PyObject* args)
3619{
3620 struct encoding_map *map = (struct encoding_map*)obj;
3621 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3622 128*map->count3);
3623}
3624
3625static PyMethodDef encoding_map_methods[] = {
3626 {"size", encoding_map_size, METH_NOARGS,
3627 PyDoc_STR("Return the size (in bytes) of this object") },
3628 { 0 }
3629};
3630
3631static void
3632encoding_map_dealloc(PyObject* o)
3633{
3634 PyObject_FREE(o);
3635}
3636
3637static PyTypeObject EncodingMapType = {
3638 PyObject_HEAD_INIT(NULL)
3639 0, /*ob_size*/
3640 "EncodingMap", /*tp_name*/
3641 sizeof(struct encoding_map), /*tp_basicsize*/
3642 0, /*tp_itemsize*/
3643 /* methods */
3644 encoding_map_dealloc, /*tp_dealloc*/
3645 0, /*tp_print*/
3646 0, /*tp_getattr*/
3647 0, /*tp_setattr*/
3648 0, /*tp_compare*/
3649 0, /*tp_repr*/
3650 0, /*tp_as_number*/
3651 0, /*tp_as_sequence*/
3652 0, /*tp_as_mapping*/
3653 0, /*tp_hash*/
3654 0, /*tp_call*/
3655 0, /*tp_str*/
3656 0, /*tp_getattro*/
3657 0, /*tp_setattro*/
3658 0, /*tp_as_buffer*/
3659 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3660 0, /*tp_doc*/
3661 0, /*tp_traverse*/
3662 0, /*tp_clear*/
3663 0, /*tp_richcompare*/
3664 0, /*tp_weaklistoffset*/
3665 0, /*tp_iter*/
3666 0, /*tp_iternext*/
3667 encoding_map_methods, /*tp_methods*/
3668 0, /*tp_members*/
3669 0, /*tp_getset*/
3670 0, /*tp_base*/
3671 0, /*tp_dict*/
3672 0, /*tp_descr_get*/
3673 0, /*tp_descr_set*/
3674 0, /*tp_dictoffset*/
3675 0, /*tp_init*/
3676 0, /*tp_alloc*/
3677 0, /*tp_new*/
3678 0, /*tp_free*/
3679 0, /*tp_is_gc*/
3680};
3681
3682PyObject*
3683PyUnicode_BuildEncodingMap(PyObject* string)
3684{
3685 Py_UNICODE *decode;
3686 PyObject *result;
3687 struct encoding_map *mresult;
3688 int i;
3689 int need_dict = 0;
3690 unsigned char level1[32];
3691 unsigned char level2[512];
3692 unsigned char *mlevel1, *mlevel2, *mlevel3;
3693 int count2 = 0, count3 = 0;
3694
3695 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3696 PyErr_BadArgument();
3697 return NULL;
3698 }
3699 decode = PyUnicode_AS_UNICODE(string);
3700 memset(level1, 0xFF, sizeof level1);
3701 memset(level2, 0xFF, sizeof level2);
3702
3703 /* If there isn't a one-to-one mapping of NULL to \0,
3704 or if there are non-BMP characters, we need to use
3705 a mapping dictionary. */
3706 if (decode[0] != 0)
3707 need_dict = 1;
3708 for (i = 1; i < 256; i++) {
3709 int l1, l2;
3710 if (decode[i] == 0
3711 #ifdef Py_UNICODE_WIDE
3712 || decode[i] > 0xFFFF
3713 #endif
3714 ) {
3715 need_dict = 1;
3716 break;
3717 }
3718 if (decode[i] == 0xFFFE)
3719 /* unmapped character */
3720 continue;
3721 l1 = decode[i] >> 11;
3722 l2 = decode[i] >> 7;
3723 if (level1[l1] == 0xFF)
3724 level1[l1] = count2++;
3725 if (level2[l2] == 0xFF)
3726 level2[l2] = count3++;
3727 }
3728
3729 if (count2 >= 0xFF || count3 >= 0xFF)
3730 need_dict = 1;
3731
3732 if (need_dict) {
3733 PyObject *result = PyDict_New();
3734 PyObject *key, *value;
3735 if (!result)
3736 return NULL;
3737 for (i = 0; i < 256; i++) {
3738 key = value = NULL;
3739 key = PyInt_FromLong(decode[i]);
3740 value = PyInt_FromLong(i);
3741 if (!key || !value)
3742 goto failed1;
3743 if (PyDict_SetItem(result, key, value) == -1)
3744 goto failed1;
3745 Py_DECREF(key);
3746 Py_DECREF(value);
3747 }
3748 return result;
3749 failed1:
3750 Py_XDECREF(key);
3751 Py_XDECREF(value);
3752 Py_DECREF(result);
3753 return NULL;
3754 }
3755
3756 /* Create a three-level trie */
3757 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3758 16*count2 + 128*count3 - 1);
3759 if (!result)
3760 return PyErr_NoMemory();
3761 PyObject_Init(result, &EncodingMapType);
3762 mresult = (struct encoding_map*)result;
3763 mresult->count2 = count2;
3764 mresult->count3 = count3;
3765 mlevel1 = mresult->level1;
3766 mlevel2 = mresult->level23;
3767 mlevel3 = mresult->level23 + 16*count2;
3768 memcpy(mlevel1, level1, 32);
3769 memset(mlevel2, 0xFF, 16*count2);
3770 memset(mlevel3, 0, 128*count3);
3771 count3 = 0;
3772 for (i = 1; i < 256; i++) {
3773 int o1, o2, o3, i2, i3;
3774 if (decode[i] == 0xFFFE)
3775 /* unmapped character */
3776 continue;
3777 o1 = decode[i]>>11;
3778 o2 = (decode[i]>>7) & 0xF;
3779 i2 = 16*mlevel1[o1] + o2;
3780 if (mlevel2[i2] == 0xFF)
3781 mlevel2[i2] = count3++;
3782 o3 = decode[i] & 0x7F;
3783 i3 = 128*mlevel2[i2] + o3;
3784 mlevel3[i3] = i;
3785 }
3786 return result;
3787}
3788
3789static int
3790encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3791{
3792 struct encoding_map *map = (struct encoding_map*)mapping;
3793 int l1 = c>>11;
3794 int l2 = (c>>7) & 0xF;
3795 int l3 = c & 0x7F;
3796 int i;
3797
3798#ifdef Py_UNICODE_WIDE
3799 if (c > 0xFFFF) {
3800 return -1;
3801 }
3802#endif
3803 if (c == 0)
3804 return 0;
3805 /* level 1*/
3806 i = map->level1[l1];
3807 if (i == 0xFF) {
3808 return -1;
3809 }
3810 /* level 2*/
3811 i = map->level23[16*i+l2];
3812 if (i == 0xFF) {
3813 return -1;
3814 }
3815 /* level 3 */
3816 i = map->level23[16*map->count2 + 128*i + l3];
3817 if (i == 0) {
3818 return -1;
3819 }
3820 return i;
3821}
3822
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823/* Lookup the character ch in the mapping. If the character
3824 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003825 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003826static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 PyObject *w = PyInt_FromLong((long)c);
3829 PyObject *x;
3830
3831 if (w == NULL)
3832 return NULL;
3833 x = PyObject_GetItem(mapping, w);
3834 Py_DECREF(w);
3835 if (x == NULL) {
3836 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3837 /* No mapping found means: mapping is undefined. */
3838 PyErr_Clear();
3839 x = Py_None;
3840 Py_INCREF(x);
3841 return x;
3842 } else
3843 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003845 else if (x == Py_None)
3846 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003847 else if (PyInt_Check(x)) {
3848 long value = PyInt_AS_LONG(x);
3849 if (value < 0 || value > 255) {
3850 PyErr_SetString(PyExc_TypeError,
3851 "character mapping must be in range(256)");
3852 Py_DECREF(x);
3853 return NULL;
3854 }
3855 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 else if (PyString_Check(x))
3858 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003861 PyErr_Format(PyExc_TypeError,
3862 "character mapping must return integer, None or str8, not %.400s",
3863 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 Py_DECREF(x);
3865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 }
3867}
3868
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003869static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003870charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003871{
Walter Dörwald827b0552007-05-12 13:23:53 +00003872 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003873 /* exponentially overallocate to minimize reallocations */
3874 if (requiredsize < 2*outsize)
3875 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003876 if (PyBytes_Resize(outobj, requiredsize)) {
3877 Py_DECREF(outobj);
3878 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003879 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003880 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003881}
3882
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003887 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 space is available. Return a new reference to the object that
3889 was put in the output buffer, or Py_None, if the mapping was undefined
3890 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003891 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003893charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003894 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003895{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003896 PyObject *rep;
3897 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003898 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003900 if (mapping->ob_type == &EncodingMapType) {
3901 int res = encoding_map_lookup(c, mapping);
3902 Py_ssize_t requiredsize = *outpos+1;
3903 if (res == -1)
3904 return enc_FAILED;
3905 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003906 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003907 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003908 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003909 outstart[(*outpos)++] = (char)res;
3910 return enc_SUCCESS;
3911 }
3912
3913 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003915 return enc_EXCEPTION;
3916 else if (rep==Py_None) {
3917 Py_DECREF(rep);
3918 return enc_FAILED;
3919 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003921 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003922 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003923 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003925 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003927 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3929 }
3930 else {
3931 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003932 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3933 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003934 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003935 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003936 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003937 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003939 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003940 memcpy(outstart + *outpos, repchars, repsize);
3941 *outpos += repsize;
3942 }
3943 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003944 Py_DECREF(rep);
3945 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946}
3947
3948/* handle an error in PyUnicode_EncodeCharmap
3949 Return 0 on success, -1 on error */
3950static
3951int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003952 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003954 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00003955 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956{
3957 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003958 Py_ssize_t repsize;
3959 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 Py_UNICODE *uni2;
3961 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003962 Py_ssize_t collstartpos = *inpos;
3963 Py_ssize_t collendpos = *inpos+1;
3964 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 char *encoding = "charmap";
3966 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003967 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 /* find all unencodable characters */
3970 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003971 PyObject *rep;
3972 if (mapping->ob_type == &EncodingMapType) {
3973 int res = encoding_map_lookup(p[collendpos], mapping);
3974 if (res != -1)
3975 break;
3976 ++collendpos;
3977 continue;
3978 }
3979
3980 rep = charmapencode_lookup(p[collendpos], mapping);
3981 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003983 else if (rep!=Py_None) {
3984 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 break;
3986 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003987 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003988 ++collendpos;
3989 }
3990 /* cache callback name lookup
3991 * (if not done yet, i.e. it's the first error) */
3992 if (*known_errorHandler==-1) {
3993 if ((errors==NULL) || (!strcmp(errors, "strict")))
3994 *known_errorHandler = 1;
3995 else if (!strcmp(errors, "replace"))
3996 *known_errorHandler = 2;
3997 else if (!strcmp(errors, "ignore"))
3998 *known_errorHandler = 3;
3999 else if (!strcmp(errors, "xmlcharrefreplace"))
4000 *known_errorHandler = 4;
4001 else
4002 *known_errorHandler = 0;
4003 }
4004 switch (*known_errorHandler) {
4005 case 1: /* strict */
4006 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4007 return -1;
4008 case 2: /* replace */
4009 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4010 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004011 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004012 return -1;
4013 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004014 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4016 return -1;
4017 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018 }
4019 /* fall through */
4020 case 3: /* ignore */
4021 *inpos = collendpos;
4022 break;
4023 case 4: /* xmlcharrefreplace */
4024 /* generate replacement (temporarily (mis)uses p) */
4025 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4026 char buffer[2+29+1+1];
4027 char *cp;
4028 sprintf(buffer, "&#%d;", (int)p[collpos]);
4029 for (cp = buffer; *cp; ++cp) {
4030 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004031 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004033 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4035 return -1;
4036 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 }
4038 }
4039 *inpos = collendpos;
4040 break;
4041 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004042 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 encoding, reason, p, size, exceptionObject,
4044 collstartpos, collendpos, &newpos);
4045 if (repunicode == NULL)
4046 return -1;
4047 /* generate replacement */
4048 repsize = PyUnicode_GET_SIZE(repunicode);
4049 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4050 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004051 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052 return -1;
4053 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004054 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4057 return -1;
4058 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 }
4060 *inpos = newpos;
4061 Py_DECREF(repunicode);
4062 }
4063 return 0;
4064}
4065
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004067 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 PyObject *mapping,
4069 const char *errors)
4070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 /* output object */
4072 PyObject *res = NULL;
4073 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004074 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004076 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 PyObject *errorHandler = NULL;
4078 PyObject *exc = NULL;
4079 /* the following variable is used for caching string comparisons
4080 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4081 * 3=ignore, 4=xmlcharrefreplace */
4082 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083
4084 /* Default to Latin-1 */
4085 if (mapping == NULL)
4086 return PyUnicode_EncodeLatin1(p, size, errors);
4087
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004088 /* allocate enough for a simple encoding without
4089 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004090 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004091 if (res == NULL)
4092 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004093 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004096 while (inpos<size) {
4097 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004098 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004099 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004101 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 if (charmap_encoding_error(p, size, &inpos, mapping,
4103 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004104 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004105 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004106 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109 else
4110 /* done with this character => adjust input position */
4111 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004115 if (respos<PyBytes_GET_SIZE(res)) {
4116 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 goto onError;
4118 }
4119 Py_XDECREF(exc);
4120 Py_XDECREF(errorHandler);
4121 return res;
4122
4123 onError:
4124 Py_XDECREF(res);
4125 Py_XDECREF(exc);
4126 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 return NULL;
4128}
4129
4130PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4131 PyObject *mapping)
4132{
4133 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4134 PyErr_BadArgument();
4135 return NULL;
4136 }
4137 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4138 PyUnicode_GET_SIZE(unicode),
4139 mapping,
4140 NULL);
4141}
4142
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143/* create or adjust a UnicodeTranslateError */
4144static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004145 const Py_UNICODE *unicode, Py_ssize_t size,
4146 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 if (*exceptionObject == NULL) {
4150 *exceptionObject = PyUnicodeTranslateError_Create(
4151 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152 }
4153 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4155 goto onError;
4156 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4157 goto onError;
4158 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4159 goto onError;
4160 return;
4161 onError:
4162 Py_DECREF(*exceptionObject);
4163 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 }
4165}
4166
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167/* raises a UnicodeTranslateError */
4168static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004169 const Py_UNICODE *unicode, Py_ssize_t size,
4170 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171 const char *reason)
4172{
4173 make_translate_exception(exceptionObject,
4174 unicode, size, startpos, endpos, reason);
4175 if (*exceptionObject != NULL)
4176 PyCodec_StrictErrors(*exceptionObject);
4177}
4178
4179/* error handling callback helper:
4180 build arguments, call the callback and check the arguments,
4181 put the result into newpos and return the replacement string, which
4182 has to be freed by the caller */
4183static PyObject *unicode_translate_call_errorhandler(const char *errors,
4184 PyObject **errorHandler,
4185 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004186 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4187 Py_ssize_t startpos, Py_ssize_t endpos,
4188 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004190 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004192 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 PyObject *restuple;
4194 PyObject *resunicode;
4195
4196 if (*errorHandler == NULL) {
4197 *errorHandler = PyCodec_LookupError(errors);
4198 if (*errorHandler == NULL)
4199 return NULL;
4200 }
4201
4202 make_translate_exception(exceptionObject,
4203 unicode, size, startpos, endpos, reason);
4204 if (*exceptionObject == NULL)
4205 return NULL;
4206
4207 restuple = PyObject_CallFunctionObjArgs(
4208 *errorHandler, *exceptionObject, NULL);
4209 if (restuple == NULL)
4210 return NULL;
4211 if (!PyTuple_Check(restuple)) {
4212 PyErr_Format(PyExc_TypeError, &argparse[4]);
4213 Py_DECREF(restuple);
4214 return NULL;
4215 }
4216 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004217 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 Py_DECREF(restuple);
4219 return NULL;
4220 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004221 if (i_newpos<0)
4222 *newpos = size+i_newpos;
4223 else
4224 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004225 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004226 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004227 Py_DECREF(restuple);
4228 return NULL;
4229 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 Py_INCREF(resunicode);
4231 Py_DECREF(restuple);
4232 return resunicode;
4233}
4234
4235/* Lookup the character ch in the mapping and put the result in result,
4236 which must be decrefed by the caller.
4237 Return 0 on success, -1 on error */
4238static
4239int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4240{
4241 PyObject *w = PyInt_FromLong((long)c);
4242 PyObject *x;
4243
4244 if (w == NULL)
4245 return -1;
4246 x = PyObject_GetItem(mapping, w);
4247 Py_DECREF(w);
4248 if (x == NULL) {
4249 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4250 /* No mapping found means: use 1:1 mapping. */
4251 PyErr_Clear();
4252 *result = NULL;
4253 return 0;
4254 } else
4255 return -1;
4256 }
4257 else if (x == Py_None) {
4258 *result = x;
4259 return 0;
4260 }
4261 else if (PyInt_Check(x)) {
4262 long value = PyInt_AS_LONG(x);
4263 long max = PyUnicode_GetMax();
4264 if (value < 0 || value > max) {
4265 PyErr_Format(PyExc_TypeError,
4266 "character mapping must be in range(0x%lx)", max+1);
4267 Py_DECREF(x);
4268 return -1;
4269 }
4270 *result = x;
4271 return 0;
4272 }
4273 else if (PyUnicode_Check(x)) {
4274 *result = x;
4275 return 0;
4276 }
4277 else {
4278 /* wrong return value */
4279 PyErr_SetString(PyExc_TypeError,
4280 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004281 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 return -1;
4283 }
4284}
4285/* ensure that *outobj is at least requiredsize characters long,
4286if not reallocate and adjust various state variables.
4287Return 0 on success, -1 on error */
4288static
Walter Dörwald4894c302003-10-24 14:25:28 +00004289int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004290 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004292 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004293 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004295 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004297 if (requiredsize < 2 * oldsize)
4298 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004299 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 return -1;
4301 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 }
4303 return 0;
4304}
4305/* lookup the character, put the result in the output string and adjust
4306 various state variables. Return a new reference to the object that
4307 was put in the output buffer in *result, or Py_None, if the mapping was
4308 undefined (in which case no character was written).
4309 The called must decref result.
4310 Return 0 on success, -1 on error. */
4311static
Walter Dörwald4894c302003-10-24 14:25:28 +00004312int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004313 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004314 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315{
Walter Dörwald4894c302003-10-24 14:25:28 +00004316 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 return -1;
4318 if (*res==NULL) {
4319 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004320 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 }
4322 else if (*res==Py_None)
4323 ;
4324 else if (PyInt_Check(*res)) {
4325 /* no overflow check, because we know that the space is enough */
4326 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4327 }
4328 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004329 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 if (repsize==1) {
4331 /* no overflow check, because we know that the space is enough */
4332 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4333 }
4334 else if (repsize!=0) {
4335 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004336 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004337 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004338 repsize - 1;
4339 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 return -1;
4341 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4342 *outp += repsize;
4343 }
4344 }
4345 else
4346 return -1;
4347 return 0;
4348}
4349
4350PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004351 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352 PyObject *mapping,
4353 const char *errors)
4354{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 /* output object */
4356 PyObject *res = NULL;
4357 /* pointers to the beginning and end+1 of input */
4358 const Py_UNICODE *startp = p;
4359 const Py_UNICODE *endp = p + size;
4360 /* pointer into the output */
4361 Py_UNICODE *str;
4362 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004363 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364 char *reason = "character maps to <undefined>";
4365 PyObject *errorHandler = NULL;
4366 PyObject *exc = NULL;
4367 /* the following variable is used for caching string comparisons
4368 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4369 * 3=ignore, 4=xmlcharrefreplace */
4370 int known_errorHandler = -1;
4371
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372 if (mapping == NULL) {
4373 PyErr_BadArgument();
4374 return NULL;
4375 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376
4377 /* allocate enough for a simple 1:1 translation without
4378 replacements, if we need more, we'll resize */
4379 res = PyUnicode_FromUnicode(NULL, size);
4380 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 return res;
4384 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386 while (p<endp) {
4387 /* try to encode it */
4388 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004389 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 goto onError;
4392 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004393 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 if (x!=Py_None) /* it worked => adjust input pointer */
4395 ++p;
4396 else { /* untranslatable character */
4397 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004398 Py_ssize_t repsize;
4399 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 Py_UNICODE *uni2;
4401 /* startpos for collecting untranslatable chars */
4402 const Py_UNICODE *collstart = p;
4403 const Py_UNICODE *collend = p+1;
4404 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 /* find all untranslatable characters */
4407 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004408 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 goto onError;
4410 Py_XDECREF(x);
4411 if (x!=Py_None)
4412 break;
4413 ++collend;
4414 }
4415 /* cache callback name lookup
4416 * (if not done yet, i.e. it's the first error) */
4417 if (known_errorHandler==-1) {
4418 if ((errors==NULL) || (!strcmp(errors, "strict")))
4419 known_errorHandler = 1;
4420 else if (!strcmp(errors, "replace"))
4421 known_errorHandler = 2;
4422 else if (!strcmp(errors, "ignore"))
4423 known_errorHandler = 3;
4424 else if (!strcmp(errors, "xmlcharrefreplace"))
4425 known_errorHandler = 4;
4426 else
4427 known_errorHandler = 0;
4428 }
4429 switch (known_errorHandler) {
4430 case 1: /* strict */
4431 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4432 goto onError;
4433 case 2: /* replace */
4434 /* No need to check for space, this is a 1:1 replacement */
4435 for (coll = collstart; coll<collend; ++coll)
4436 *str++ = '?';
4437 /* fall through */
4438 case 3: /* ignore */
4439 p = collend;
4440 break;
4441 case 4: /* xmlcharrefreplace */
4442 /* generate replacement (temporarily (mis)uses p) */
4443 for (p = collstart; p < collend; ++p) {
4444 char buffer[2+29+1+1];
4445 char *cp;
4446 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004447 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4449 goto onError;
4450 for (cp = buffer; *cp; ++cp)
4451 *str++ = *cp;
4452 }
4453 p = collend;
4454 break;
4455 default:
4456 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4457 reason, startp, size, &exc,
4458 collstart-startp, collend-startp, &newpos);
4459 if (repunicode == NULL)
4460 goto onError;
4461 /* generate replacement */
4462 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004463 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4465 Py_DECREF(repunicode);
4466 goto onError;
4467 }
4468 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4469 *str++ = *uni2;
4470 p = startp + newpos;
4471 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 }
4473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 /* Resize if we allocated to much */
4476 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004477 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004478 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004479 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 }
4481 Py_XDECREF(exc);
4482 Py_XDECREF(errorHandler);
4483 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 onError:
4486 Py_XDECREF(res);
4487 Py_XDECREF(exc);
4488 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 return NULL;
4490}
4491
4492PyObject *PyUnicode_Translate(PyObject *str,
4493 PyObject *mapping,
4494 const char *errors)
4495{
4496 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004497
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 str = PyUnicode_FromObject(str);
4499 if (str == NULL)
4500 goto onError;
4501 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4502 PyUnicode_GET_SIZE(str),
4503 mapping,
4504 errors);
4505 Py_DECREF(str);
4506 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004507
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 onError:
4509 Py_XDECREF(str);
4510 return NULL;
4511}
Tim Petersced69f82003-09-16 20:30:58 +00004512
Guido van Rossum9e896b32000-04-05 20:11:21 +00004513/* --- Decimal Encoder ---------------------------------------------------- */
4514
4515int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004516 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004517 char *output,
4518 const char *errors)
4519{
4520 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 PyObject *errorHandler = NULL;
4522 PyObject *exc = NULL;
4523 const char *encoding = "decimal";
4524 const char *reason = "invalid decimal Unicode string";
4525 /* the following variable is used for caching string comparisons
4526 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4527 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004528
4529 if (output == NULL) {
4530 PyErr_BadArgument();
4531 return -1;
4532 }
4533
4534 p = s;
4535 end = s + length;
4536 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004538 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004540 Py_ssize_t repsize;
4541 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 Py_UNICODE *uni2;
4543 Py_UNICODE *collstart;
4544 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004545
Guido van Rossum9e896b32000-04-05 20:11:21 +00004546 if (Py_UNICODE_ISSPACE(ch)) {
4547 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004549 continue;
4550 }
4551 decimal = Py_UNICODE_TODECIMAL(ch);
4552 if (decimal >= 0) {
4553 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004555 continue;
4556 }
Guido van Rossumba477042000-04-06 18:18:10 +00004557 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004558 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004560 continue;
4561 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 /* All other characters are considered unencodable */
4563 collstart = p;
4564 collend = p+1;
4565 while (collend < end) {
4566 if ((0 < *collend && *collend < 256) ||
4567 !Py_UNICODE_ISSPACE(*collend) ||
4568 Py_UNICODE_TODECIMAL(*collend))
4569 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004570 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 /* cache callback name lookup
4572 * (if not done yet, i.e. it's the first error) */
4573 if (known_errorHandler==-1) {
4574 if ((errors==NULL) || (!strcmp(errors, "strict")))
4575 known_errorHandler = 1;
4576 else if (!strcmp(errors, "replace"))
4577 known_errorHandler = 2;
4578 else if (!strcmp(errors, "ignore"))
4579 known_errorHandler = 3;
4580 else if (!strcmp(errors, "xmlcharrefreplace"))
4581 known_errorHandler = 4;
4582 else
4583 known_errorHandler = 0;
4584 }
4585 switch (known_errorHandler) {
4586 case 1: /* strict */
4587 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4588 goto onError;
4589 case 2: /* replace */
4590 for (p = collstart; p < collend; ++p)
4591 *output++ = '?';
4592 /* fall through */
4593 case 3: /* ignore */
4594 p = collend;
4595 break;
4596 case 4: /* xmlcharrefreplace */
4597 /* generate replacement (temporarily (mis)uses p) */
4598 for (p = collstart; p < collend; ++p)
4599 output += sprintf(output, "&#%d;", (int)*p);
4600 p = collend;
4601 break;
4602 default:
4603 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4604 encoding, reason, s, length, &exc,
4605 collstart-s, collend-s, &newpos);
4606 if (repunicode == NULL)
4607 goto onError;
4608 /* generate replacement */
4609 repsize = PyUnicode_GET_SIZE(repunicode);
4610 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4611 Py_UNICODE ch = *uni2;
4612 if (Py_UNICODE_ISSPACE(ch))
4613 *output++ = ' ';
4614 else {
4615 decimal = Py_UNICODE_TODECIMAL(ch);
4616 if (decimal >= 0)
4617 *output++ = '0' + decimal;
4618 else if (0 < ch && ch < 256)
4619 *output++ = (char)ch;
4620 else {
4621 Py_DECREF(repunicode);
4622 raise_encode_exception(&exc, encoding,
4623 s, length, collstart-s, collend-s, reason);
4624 goto onError;
4625 }
4626 }
4627 }
4628 p = s + newpos;
4629 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004630 }
4631 }
4632 /* 0-terminate the output string */
4633 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634 Py_XDECREF(exc);
4635 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004636 return 0;
4637
4638 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 Py_XDECREF(exc);
4640 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004641 return -1;
4642}
4643
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644/* --- Helpers ------------------------------------------------------------ */
4645
/* Configuration for the stringlib headers included further down:
   they are written against these STRINGLIB_* names, which are bound here
   to the Py_UNICODE type and the PyUnicode accessors/constructor.
   NOTE(review): exact use of each name lives in stringlib/ -- confirm there. */
#define STRINGLIB_CHAR Py_UNICODE

#define STRINGLIB_LEN PyUnicode_GET_SIZE
#define STRINGLIB_NEW PyUnicode_FromUnicode
#define STRINGLIB_STR PyUnicode_AS_UNICODE
4651
4652Py_LOCAL_INLINE(int)
4653STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004655 if (str[0] != other[0])
4656 return 1;
4657 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658}
4659
Thomas Wouters477c8d52006-05-27 19:21:47 +00004660#define STRINGLIB_EMPTY unicode_empty
4661
4662#include "stringlib/fastsearch.h"
4663
4664#include "stringlib/count.h"
4665#include "stringlib/find.h"
4666#include "stringlib/partition.h"
4667
/* helper macro to fixup start/end slice values

   Operates on variables named `start` and `end` in the invoking scope and
   uses (obj)->length as the length:
     - a negative start has the length added and is then clamped to 0
     - an end greater than the length is clamped to the length
     - a negative end has the length added and is then clamped to 0
   start is not clamped against the length here.

   NOTE(review): the expansion is a sequence of separate `if` statements and
   is not wrapped in do { } while (0), so it must not be used as the body
   of an unbraced if/else; (obj) is evaluated several times. */
#define FIX_START_END(obj) \
    if (start < 0) \
        start += (obj)->length; \
    if (start < 0) \
        start = 0; \
    if (end > (obj)->length) \
        end = (obj)->length; \
    if (end < 0) \
        end += (obj)->length; \
    if (end < 0) \
        end = 0;
4680
Martin v. Löwis18e16552006-02-15 17:27:45 +00004681Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004682 PyObject *substr,
4683 Py_ssize_t start,
4684 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004686 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004687 PyUnicodeObject* str_obj;
4688 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004689
Thomas Wouters477c8d52006-05-27 19:21:47 +00004690 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4691 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004693 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4694 if (!sub_obj) {
4695 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 return -1;
4697 }
Tim Petersced69f82003-09-16 20:30:58 +00004698
Thomas Wouters477c8d52006-05-27 19:21:47 +00004699 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004700
Thomas Wouters477c8d52006-05-27 19:21:47 +00004701 result = stringlib_count(
4702 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4703 );
4704
4705 Py_DECREF(sub_obj);
4706 Py_DECREF(str_obj);
4707
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 return result;
4709}
4710
Martin v. Löwis18e16552006-02-15 17:27:45 +00004711Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004712 PyObject *sub,
4713 Py_ssize_t start,
4714 Py_ssize_t end,
4715 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004717 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004718
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004720 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004721 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004722 sub = PyUnicode_FromObject(sub);
4723 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004724 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004725 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 }
Tim Petersced69f82003-09-16 20:30:58 +00004727
Thomas Wouters477c8d52006-05-27 19:21:47 +00004728 if (direction > 0)
4729 result = stringlib_find_slice(
4730 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4731 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4732 start, end
4733 );
4734 else
4735 result = stringlib_rfind_slice(
4736 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4737 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4738 start, end
4739 );
4740
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004742 Py_DECREF(sub);
4743
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 return result;
4745}
4746
Tim Petersced69f82003-09-16 20:30:58 +00004747static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748int tailmatch(PyUnicodeObject *self,
4749 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004750 Py_ssize_t start,
4751 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752 int direction)
4753{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 if (substring->length == 0)
4755 return 1;
4756
Thomas Wouters477c8d52006-05-27 19:21:47 +00004757 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758
4759 end -= substring->length;
4760 if (end < start)
4761 return 0;
4762
4763 if (direction > 0) {
4764 if (Py_UNICODE_MATCH(self, end, substring))
4765 return 1;
4766 } else {
4767 if (Py_UNICODE_MATCH(self, start, substring))
4768 return 1;
4769 }
4770
4771 return 0;
4772}
4773
Martin v. Löwis18e16552006-02-15 17:27:45 +00004774Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004776 Py_ssize_t start,
4777 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 int direction)
4779{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004780 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004781
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 str = PyUnicode_FromObject(str);
4783 if (str == NULL)
4784 return -1;
4785 substr = PyUnicode_FromObject(substr);
4786 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004787 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 return -1;
4789 }
Tim Petersced69f82003-09-16 20:30:58 +00004790
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 result = tailmatch((PyUnicodeObject *)str,
4792 (PyUnicodeObject *)substr,
4793 start, end, direction);
4794 Py_DECREF(str);
4795 Py_DECREF(substr);
4796 return result;
4797}
4798
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799/* Apply fixfct filter to the Unicode object self and return a
4800 reference to the modified object */
4801
Tim Petersced69f82003-09-16 20:30:58 +00004802static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803PyObject *fixup(PyUnicodeObject *self,
4804 int (*fixfct)(PyUnicodeObject *s))
4805{
4806
4807 PyUnicodeObject *u;
4808
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004809 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 if (u == NULL)
4811 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004812
4813 Py_UNICODE_COPY(u->str, self->str, self->length);
4814
Tim Peters7a29bd52001-09-12 03:03:31 +00004815 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 /* fixfct should return TRUE if it modified the buffer. If
4817 FALSE, return a reference to the original buffer instead
4818 (to save space, not time) */
4819 Py_INCREF(self);
4820 Py_DECREF(u);
4821 return (PyObject*) self;
4822 }
4823 return (PyObject*) u;
4824}
4825
Tim Petersced69f82003-09-16 20:30:58 +00004826static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827int fixupper(PyUnicodeObject *self)
4828{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004829 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 Py_UNICODE *s = self->str;
4831 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004832
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 while (len-- > 0) {
4834 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004835
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 ch = Py_UNICODE_TOUPPER(*s);
4837 if (ch != *s) {
4838 status = 1;
4839 *s = ch;
4840 }
4841 s++;
4842 }
4843
4844 return status;
4845}
4846
Tim Petersced69f82003-09-16 20:30:58 +00004847static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848int fixlower(PyUnicodeObject *self)
4849{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004850 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 Py_UNICODE *s = self->str;
4852 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004853
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 while (len-- > 0) {
4855 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004856
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 ch = Py_UNICODE_TOLOWER(*s);
4858 if (ch != *s) {
4859 status = 1;
4860 *s = ch;
4861 }
4862 s++;
4863 }
4864
4865 return status;
4866}
4867
Tim Petersced69f82003-09-16 20:30:58 +00004868static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869int fixswapcase(PyUnicodeObject *self)
4870{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004871 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 Py_UNICODE *s = self->str;
4873 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004874
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 while (len-- > 0) {
4876 if (Py_UNICODE_ISUPPER(*s)) {
4877 *s = Py_UNICODE_TOLOWER(*s);
4878 status = 1;
4879 } else if (Py_UNICODE_ISLOWER(*s)) {
4880 *s = Py_UNICODE_TOUPPER(*s);
4881 status = 1;
4882 }
4883 s++;
4884 }
4885
4886 return status;
4887}
4888
Tim Petersced69f82003-09-16 20:30:58 +00004889static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890int fixcapitalize(PyUnicodeObject *self)
4891{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004892 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004893 Py_UNICODE *s = self->str;
4894 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004895
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004896 if (len == 0)
4897 return 0;
4898 if (Py_UNICODE_ISLOWER(*s)) {
4899 *s = Py_UNICODE_TOUPPER(*s);
4900 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004902 s++;
4903 while (--len > 0) {
4904 if (Py_UNICODE_ISUPPER(*s)) {
4905 *s = Py_UNICODE_TOLOWER(*s);
4906 status = 1;
4907 }
4908 s++;
4909 }
4910 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911}
4912
4913static
4914int fixtitle(PyUnicodeObject *self)
4915{
4916 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4917 register Py_UNICODE *e;
4918 int previous_is_cased;
4919
4920 /* Shortcut for single character strings */
4921 if (PyUnicode_GET_SIZE(self) == 1) {
4922 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4923 if (*p != ch) {
4924 *p = ch;
4925 return 1;
4926 }
4927 else
4928 return 0;
4929 }
Tim Petersced69f82003-09-16 20:30:58 +00004930
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 e = p + PyUnicode_GET_SIZE(self);
4932 previous_is_cased = 0;
4933 for (; p < e; p++) {
4934 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004935
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 if (previous_is_cased)
4937 *p = Py_UNICODE_TOLOWER(ch);
4938 else
4939 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004940
4941 if (Py_UNICODE_ISLOWER(ch) ||
4942 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 Py_UNICODE_ISTITLE(ch))
4944 previous_is_cased = 1;
4945 else
4946 previous_is_cased = 0;
4947 }
4948 return 1;
4949}
4950
Tim Peters8ce9f162004-08-27 01:49:32 +00004951PyObject *
4952PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953{
Tim Peters8ce9f162004-08-27 01:49:32 +00004954 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004955 const Py_UNICODE blank = ' ';
4956 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004957 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004958 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004959 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4960 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004961 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4962 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004963 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004964 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004965 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966
Tim Peters05eba1f2004-08-27 21:32:02 +00004967 fseq = PySequence_Fast(seq, "");
4968 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004969 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004970 }
4971
Tim Peters91879ab2004-08-27 22:35:44 +00004972 /* Grrrr. A codec may be invoked to convert str objects to
4973 * Unicode, and so it's possible to call back into Python code
4974 * during PyUnicode_FromObject(), and so it's possible for a sick
4975 * codec to change the size of fseq (if seq is a list). Therefore
4976 * we have to keep refetching the size -- can't assume seqlen
4977 * is invariant.
4978 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004979 seqlen = PySequence_Fast_GET_SIZE(fseq);
4980 /* If empty sequence, return u"". */
4981 if (seqlen == 0) {
4982 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4983 goto Done;
4984 }
4985 /* If singleton sequence with an exact Unicode, return that. */
4986 if (seqlen == 1) {
4987 item = PySequence_Fast_GET_ITEM(fseq, 0);
4988 if (PyUnicode_CheckExact(item)) {
4989 Py_INCREF(item);
4990 res = (PyUnicodeObject *)item;
4991 goto Done;
4992 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004993 }
4994
Tim Peters05eba1f2004-08-27 21:32:02 +00004995 /* At least two items to join, or one that isn't exact Unicode. */
4996 if (seqlen > 1) {
4997 /* Set up sep and seplen -- they're needed. */
4998 if (separator == NULL) {
4999 sep = &blank;
5000 seplen = 1;
5001 }
5002 else {
5003 internal_separator = PyUnicode_FromObject(separator);
5004 if (internal_separator == NULL)
5005 goto onError;
5006 sep = PyUnicode_AS_UNICODE(internal_separator);
5007 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005008 /* In case PyUnicode_FromObject() mutated seq. */
5009 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005010 }
5011 }
5012
5013 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005014 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005015 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005016 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005017 res_p = PyUnicode_AS_UNICODE(res);
5018 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005019
Tim Peters05eba1f2004-08-27 21:32:02 +00005020 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005021 Py_ssize_t itemlen;
5022 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005023
5024 item = PySequence_Fast_GET_ITEM(fseq, i);
5025 /* Convert item to Unicode. */
5026 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5027 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005028 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005029 " %.80s found",
5030 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005031 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005032 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005033 item = PyUnicode_FromObject(item);
5034 if (item == NULL)
5035 goto onError;
5036 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005037
Tim Peters91879ab2004-08-27 22:35:44 +00005038 /* In case PyUnicode_FromObject() mutated seq. */
5039 seqlen = PySequence_Fast_GET_SIZE(fseq);
5040
Tim Peters8ce9f162004-08-27 01:49:32 +00005041 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005043 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005044 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005045 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005046 if (i < seqlen - 1) {
5047 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005048 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005049 goto Overflow;
5050 }
5051 if (new_res_used > res_alloc) {
5052 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005053 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005054 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005055 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005056 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005057 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005058 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005059 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005061 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005062 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005064
5065 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005066 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005067 res_p += itemlen;
5068 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005069 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005070 res_p += seplen;
5071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005073 res_used = new_res_used;
5074 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005075
Tim Peters05eba1f2004-08-27 21:32:02 +00005076 /* Shrink res to match the used area; this probably can't fail,
5077 * but it's cheap to check.
5078 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005079 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005080 goto onError;
5081
5082 Done:
5083 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005084 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 return (PyObject *)res;
5086
Tim Peters8ce9f162004-08-27 01:49:32 +00005087 Overflow:
5088 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005089 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005090 Py_DECREF(item);
5091 /* fall through */
5092
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005094 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005095 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005096 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 return NULL;
5098}
5099
Tim Petersced69f82003-09-16 20:30:58 +00005100static
5101PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005102 Py_ssize_t left,
5103 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 Py_UNICODE fill)
5105{
5106 PyUnicodeObject *u;
5107
5108 if (left < 0)
5109 left = 0;
5110 if (right < 0)
5111 right = 0;
5112
Tim Peters7a29bd52001-09-12 03:03:31 +00005113 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 Py_INCREF(self);
5115 return self;
5116 }
5117
5118 u = _PyUnicode_New(left + self->length + right);
5119 if (u) {
5120 if (left)
5121 Py_UNICODE_FILL(u->str, fill, left);
5122 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5123 if (right)
5124 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5125 }
5126
5127 return u;
5128}
5129
/* Append the slice data[left:right] to the list being built.

   Relies on names from the enclosing function: the PyObject* locals
   `str` (scratch) and `list` (the result list), and a label `onError`
   that is jumped to if the slice cannot be created or appended.  The
   temporary reference held in `str` is released on both paths.

   Wrapped in do { } while (0) so that the expansion is one statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5140
5141static
5142PyObject *split_whitespace(PyUnicodeObject *self,
5143 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005144 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005146 register Py_ssize_t i;
5147 register Py_ssize_t j;
5148 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 PyObject *str;
5150
5151 for (i = j = 0; i < len; ) {
5152 /* find a token */
5153 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5154 i++;
5155 j = i;
5156 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5157 i++;
5158 if (j < i) {
5159 if (maxcount-- <= 0)
5160 break;
5161 SPLIT_APPEND(self->str, j, i);
5162 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5163 i++;
5164 j = i;
5165 }
5166 }
5167 if (j < len) {
5168 SPLIT_APPEND(self->str, j, len);
5169 }
5170 return list;
5171
5172 onError:
5173 Py_DECREF(list);
5174 return NULL;
5175}
5176
5177PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005178 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005180 register Py_ssize_t i;
5181 register Py_ssize_t j;
5182 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 PyObject *list;
5184 PyObject *str;
5185 Py_UNICODE *data;
5186
5187 string = PyUnicode_FromObject(string);
5188 if (string == NULL)
5189 return NULL;
5190 data = PyUnicode_AS_UNICODE(string);
5191 len = PyUnicode_GET_SIZE(string);
5192
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 list = PyList_New(0);
5194 if (!list)
5195 goto onError;
5196
5197 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005198 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005199
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005201 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203
5204 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005205 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 if (i < len) {
5207 if (data[i] == '\r' && i + 1 < len &&
5208 data[i+1] == '\n')
5209 i += 2;
5210 else
5211 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005212 if (keepends)
5213 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 }
Guido van Rossum86662912000-04-11 15:38:46 +00005215 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 j = i;
5217 }
5218 if (j < len) {
5219 SPLIT_APPEND(data, j, len);
5220 }
5221
5222 Py_DECREF(string);
5223 return list;
5224
5225 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005226 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 Py_DECREF(string);
5228 return NULL;
5229}
5230
Tim Petersced69f82003-09-16 20:30:58 +00005231static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232PyObject *split_char(PyUnicodeObject *self,
5233 PyObject *list,
5234 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237 register Py_ssize_t i;
5238 register Py_ssize_t j;
5239 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 PyObject *str;
5241
5242 for (i = j = 0; i < len; ) {
5243 if (self->str[i] == ch) {
5244 if (maxcount-- <= 0)
5245 break;
5246 SPLIT_APPEND(self->str, j, i);
5247 i = j = i + 1;
5248 } else
5249 i++;
5250 }
5251 if (j <= len) {
5252 SPLIT_APPEND(self->str, j, len);
5253 }
5254 return list;
5255
5256 onError:
5257 Py_DECREF(list);
5258 return NULL;
5259}
5260
Tim Petersced69f82003-09-16 20:30:58 +00005261static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262PyObject *split_substring(PyUnicodeObject *self,
5263 PyObject *list,
5264 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005265 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 register Py_ssize_t i;
5268 register Py_ssize_t j;
5269 Py_ssize_t len = self->length;
5270 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 PyObject *str;
5272
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005273 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 if (Py_UNICODE_MATCH(self, i, substring)) {
5275 if (maxcount-- <= 0)
5276 break;
5277 SPLIT_APPEND(self->str, j, i);
5278 i = j = i + sublen;
5279 } else
5280 i++;
5281 }
5282 if (j <= len) {
5283 SPLIT_APPEND(self->str, j, len);
5284 }
5285 return list;
5286
5287 onError:
5288 Py_DECREF(list);
5289 return NULL;
5290}
5291
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005292static
5293PyObject *rsplit_whitespace(PyUnicodeObject *self,
5294 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005295 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005296{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 register Py_ssize_t i;
5298 register Py_ssize_t j;
5299 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005300 PyObject *str;
5301
5302 for (i = j = len - 1; i >= 0; ) {
5303 /* find a token */
5304 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5305 i--;
5306 j = i;
5307 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5308 i--;
5309 if (j > i) {
5310 if (maxcount-- <= 0)
5311 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005312 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005313 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5314 i--;
5315 j = i;
5316 }
5317 }
5318 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005319 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005320 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005321 if (PyList_Reverse(list) < 0)
5322 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005323 return list;
5324
5325 onError:
5326 Py_DECREF(list);
5327 return NULL;
5328}
5329
5330static
5331PyObject *rsplit_char(PyUnicodeObject *self,
5332 PyObject *list,
5333 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005334 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005335{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336 register Py_ssize_t i;
5337 register Py_ssize_t j;
5338 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005339 PyObject *str;
5340
5341 for (i = j = len - 1; i >= 0; ) {
5342 if (self->str[i] == ch) {
5343 if (maxcount-- <= 0)
5344 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005345 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005346 j = i = i - 1;
5347 } else
5348 i--;
5349 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005350 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005351 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005352 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005353 if (PyList_Reverse(list) < 0)
5354 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005355 return list;
5356
5357 onError:
5358 Py_DECREF(list);
5359 return NULL;
5360}
5361
5362static
5363PyObject *rsplit_substring(PyUnicodeObject *self,
5364 PyObject *list,
5365 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005366 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005367{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005368 register Py_ssize_t i;
5369 register Py_ssize_t j;
5370 Py_ssize_t len = self->length;
5371 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005372 PyObject *str;
5373
5374 for (i = len - sublen, j = len; i >= 0; ) {
5375 if (Py_UNICODE_MATCH(self, i, substring)) {
5376 if (maxcount-- <= 0)
5377 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005378 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005379 j = i;
5380 i -= sublen;
5381 } else
5382 i--;
5383 }
5384 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005385 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005386 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005387 if (PyList_Reverse(list) < 0)
5388 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005389 return list;
5390
5391 onError:
5392 Py_DECREF(list);
5393 return NULL;
5394}
5395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396#undef SPLIT_APPEND
5397
5398static
5399PyObject *split(PyUnicodeObject *self,
5400 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005401 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402{
5403 PyObject *list;
5404
5405 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005406 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407
5408 list = PyList_New(0);
5409 if (!list)
5410 return NULL;
5411
5412 if (substring == NULL)
5413 return split_whitespace(self,list,maxcount);
5414
5415 else if (substring->length == 1)
5416 return split_char(self,list,substring->str[0],maxcount);
5417
5418 else if (substring->length == 0) {
5419 Py_DECREF(list);
5420 PyErr_SetString(PyExc_ValueError, "empty separator");
5421 return NULL;
5422 }
5423 else
5424 return split_substring(self,list,substring,maxcount);
5425}
5426
Tim Petersced69f82003-09-16 20:30:58 +00005427static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005428PyObject *rsplit(PyUnicodeObject *self,
5429 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005430 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005431{
5432 PyObject *list;
5433
5434 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005435 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005436
5437 list = PyList_New(0);
5438 if (!list)
5439 return NULL;
5440
5441 if (substring == NULL)
5442 return rsplit_whitespace(self,list,maxcount);
5443
5444 else if (substring->length == 1)
5445 return rsplit_char(self,list,substring->str[0],maxcount);
5446
5447 else if (substring->length == 0) {
5448 Py_DECREF(list);
5449 PyErr_SetString(PyExc_ValueError, "empty separator");
5450 return NULL;
5451 }
5452 else
5453 return rsplit_substring(self,list,substring,maxcount);
5454}
5455
5456static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457PyObject *replace(PyUnicodeObject *self,
5458 PyUnicodeObject *str1,
5459 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005460 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461{
5462 PyUnicodeObject *u;
5463
5464 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005465 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466
Thomas Wouters477c8d52006-05-27 19:21:47 +00005467 if (str1->length == str2->length) {
5468 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005469 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005470 if (str1->length == 1) {
5471 /* replace characters */
5472 Py_UNICODE u1, u2;
5473 if (!findchar(self->str, self->length, str1->str[0]))
5474 goto nothing;
5475 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5476 if (!u)
5477 return NULL;
5478 Py_UNICODE_COPY(u->str, self->str, self->length);
5479 u1 = str1->str[0];
5480 u2 = str2->str[0];
5481 for (i = 0; i < u->length; i++)
5482 if (u->str[i] == u1) {
5483 if (--maxcount < 0)
5484 break;
5485 u->str[i] = u2;
5486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005488 i = fastsearch(
5489 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005491 if (i < 0)
5492 goto nothing;
5493 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5494 if (!u)
5495 return NULL;
5496 Py_UNICODE_COPY(u->str, self->str, self->length);
5497 while (i <= self->length - str1->length)
5498 if (Py_UNICODE_MATCH(self, i, str1)) {
5499 if (--maxcount < 0)
5500 break;
5501 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5502 i += str1->length;
5503 } else
5504 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005507
5508 Py_ssize_t n, i, j, e;
5509 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 Py_UNICODE *p;
5511
5512 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005513 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 if (n > maxcount)
5515 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005516 if (n == 0)
5517 goto nothing;
5518 /* new_size = self->length + n * (str2->length - str1->length)); */
5519 delta = (str2->length - str1->length);
5520 if (delta == 0) {
5521 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005523 product = n * (str2->length - str1->length);
5524 if ((product / (str2->length - str1->length)) != n) {
5525 PyErr_SetString(PyExc_OverflowError,
5526 "replace string is too long");
5527 return NULL;
5528 }
5529 new_size = self->length + product;
5530 if (new_size < 0) {
5531 PyErr_SetString(PyExc_OverflowError,
5532 "replace string is too long");
5533 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 }
5535 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005536 u = _PyUnicode_New(new_size);
5537 if (!u)
5538 return NULL;
5539 i = 0;
5540 p = u->str;
5541 e = self->length - str1->length;
5542 if (str1->length > 0) {
5543 while (n-- > 0) {
5544 /* look for next match */
5545 j = i;
5546 while (j <= e) {
5547 if (Py_UNICODE_MATCH(self, j, str1))
5548 break;
5549 j++;
5550 }
5551 if (j > i) {
5552 if (j > e)
5553 break;
5554 /* copy unchanged part [i:j] */
5555 Py_UNICODE_COPY(p, self->str+i, j-i);
5556 p += j - i;
5557 }
5558 /* copy substitution string */
5559 if (str2->length > 0) {
5560 Py_UNICODE_COPY(p, str2->str, str2->length);
5561 p += str2->length;
5562 }
5563 i = j + str1->length;
5564 }
5565 if (i < self->length)
5566 /* copy tail [i:] */
5567 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5568 } else {
5569 /* interleave */
5570 while (n > 0) {
5571 Py_UNICODE_COPY(p, str2->str, str2->length);
5572 p += str2->length;
5573 if (--n <= 0)
5574 break;
5575 *p++ = self->str[i++];
5576 }
5577 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005581
5582nothing:
5583 /* nothing to replace; return original string (when possible) */
5584 if (PyUnicode_CheckExact(self)) {
5585 Py_INCREF(self);
5586 return (PyObject *) self;
5587 }
5588 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589}
5590
5591/* --- Unicode Object Methods --------------------------------------------- */
5592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005593PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594"S.title() -> unicode\n\
5595\n\
5596Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005597characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598
5599static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005600unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 return fixup(self, fixtitle);
5603}
5604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005605PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606"S.capitalize() -> unicode\n\
5607\n\
5608Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005609have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610
5611static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005612unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 return fixup(self, fixcapitalize);
5615}
5616
5617#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005618PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619"S.capwords() -> unicode\n\
5620\n\
5621Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005622normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623
5624static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005625unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626{
5627 PyObject *list;
5628 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005629 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 /* Split into words */
5632 list = split(self, NULL, -1);
5633 if (!list)
5634 return NULL;
5635
5636 /* Capitalize each word */
5637 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5638 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5639 fixcapitalize);
5640 if (item == NULL)
5641 goto onError;
5642 Py_DECREF(PyList_GET_ITEM(list, i));
5643 PyList_SET_ITEM(list, i, item);
5644 }
5645
5646 /* Join the words to form a new string */
5647 item = PyUnicode_Join(NULL, list);
5648
5649onError:
5650 Py_DECREF(list);
5651 return (PyObject *)item;
5652}
5653#endif
5654
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005655/* Argument converter. Coerces to a single unicode character */
5656
5657static int
5658convert_uc(PyObject *obj, void *addr)
5659{
5660 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5661 PyObject *uniobj;
5662 Py_UNICODE *unistr;
5663
5664 uniobj = PyUnicode_FromObject(obj);
5665 if (uniobj == NULL) {
5666 PyErr_SetString(PyExc_TypeError,
5667 "The fill character cannot be converted to Unicode");
5668 return 0;
5669 }
5670 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5671 PyErr_SetString(PyExc_TypeError,
5672 "The fill character must be exactly one character long");
5673 Py_DECREF(uniobj);
5674 return 0;
5675 }
5676 unistr = PyUnicode_AS_UNICODE(uniobj);
5677 *fillcharloc = unistr[0];
5678 Py_DECREF(uniobj);
5679 return 1;
5680}
5681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005682PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005683"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005685Return S centered in a Unicode string of length width. Padding is\n\
5686done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687
5688static PyObject *
5689unicode_center(PyUnicodeObject *self, PyObject *args)
5690{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005691 Py_ssize_t marg, left;
5692 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005693 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694
Thomas Woutersde017742006-02-16 19:34:37 +00005695 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 return NULL;
5697
Tim Peters7a29bd52001-09-12 03:03:31 +00005698 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 Py_INCREF(self);
5700 return (PyObject*) self;
5701 }
5702
5703 marg = width - self->length;
5704 left = marg / 2 + (marg & width & 1);
5705
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005706 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707}
5708
Marc-André Lemburge5034372000-08-08 08:04:29 +00005709#if 0
5710
5711/* This code should go into some future Unicode collation support
5712 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005713 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005714
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005715/* speedy UTF-16 code point order comparison */
5716/* gleaned from: */
5717/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5718
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005719static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005720{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005721 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005722 0, 0, 0, 0, 0, 0, 0, 0,
5723 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005724 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005725};
5726
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727static int
5728unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5729{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005730 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005731
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 Py_UNICODE *s1 = str1->str;
5733 Py_UNICODE *s2 = str2->str;
5734
5735 len1 = str1->length;
5736 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005737
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005739 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005740
5741 c1 = *s1++;
5742 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005743
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005744 if (c1 > (1<<11) * 26)
5745 c1 += utf16Fixup[c1>>11];
5746 if (c2 > (1<<11) * 26)
5747 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005748 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005749
5750 if (c1 != c2)
5751 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005752
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005753 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 }
5755
5756 return (len1 < len2) ? -1 : (len1 != len2);
5757}
5758
Marc-André Lemburge5034372000-08-08 08:04:29 +00005759#else
5760
5761static int
5762unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5763{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005764 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005765
5766 Py_UNICODE *s1 = str1->str;
5767 Py_UNICODE *s2 = str2->str;
5768
5769 len1 = str1->length;
5770 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005771
Marc-André Lemburge5034372000-08-08 08:04:29 +00005772 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005773 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005774
Fredrik Lundh45714e92001-06-26 16:39:36 +00005775 c1 = *s1++;
5776 c2 = *s2++;
5777
5778 if (c1 != c2)
5779 return (c1 < c2) ? -1 : 1;
5780
Marc-André Lemburge5034372000-08-08 08:04:29 +00005781 len1--; len2--;
5782 }
5783
5784 return (len1 < len2) ? -1 : (len1 != len2);
5785}
5786
5787#endif
5788
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789int PyUnicode_Compare(PyObject *left,
5790 PyObject *right)
5791{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005792 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5793 return unicode_compare((PyUnicodeObject *)left,
5794 (PyUnicodeObject *)right);
5795 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5796 (PyUnicode_Check(left) && PyString_Check(right))) {
5797 if (PyUnicode_Check(left))
5798 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5799 if (PyUnicode_Check(right))
5800 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5801 assert(PyString_Check(left));
5802 assert(PyString_Check(right));
5803 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005805 PyErr_Format(PyExc_TypeError,
5806 "Can't compare %.100s and %.100s",
5807 left->ob_type->tp_name,
5808 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809 return -1;
5810}
5811
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005812PyObject *PyUnicode_RichCompare(PyObject *left,
5813 PyObject *right,
5814 int op)
5815{
5816 int result;
5817
5818 result = PyUnicode_Compare(left, right);
5819 if (result == -1 && PyErr_Occurred())
5820 goto onError;
5821
5822 /* Convert the return value to a Boolean */
5823 switch (op) {
5824 case Py_EQ:
5825 result = (result == 0);
5826 break;
5827 case Py_NE:
5828 result = (result != 0);
5829 break;
5830 case Py_LE:
5831 result = (result <= 0);
5832 break;
5833 case Py_GE:
5834 result = (result >= 0);
5835 break;
5836 case Py_LT:
5837 result = (result == -1);
5838 break;
5839 case Py_GT:
5840 result = (result == 1);
5841 break;
5842 }
5843 return PyBool_FromLong(result);
5844
5845 onError:
5846
5847 /* Standard case
5848
5849 Type errors mean that PyUnicode_FromObject() could not convert
5850 one of the arguments (usually the right hand side) to Unicode,
5851 ie. we can't handle the comparison request. However, it is
5852 possible that the other object knows a comparison method, which
5853 is why we return Py_NotImplemented to give the other object a
5854 chance.
5855
5856 */
5857 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5858 PyErr_Clear();
5859 Py_INCREF(Py_NotImplemented);
5860 return Py_NotImplemented;
5861 }
5862 if (op != Py_EQ && op != Py_NE)
5863 return NULL;
5864
5865 /* Equality comparison.
5866
5867 This is a special case: we silence any PyExc_UnicodeDecodeError
5868 and instead turn it into a PyErr_UnicodeWarning.
5869
5870 */
5871 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5872 return NULL;
5873 PyErr_Clear();
5874 if (PyErr_Warn(PyExc_UnicodeWarning,
5875 (op == Py_EQ) ?
5876 "Unicode equal comparison "
5877 "failed to convert both arguments to Unicode - "
5878 "interpreting them as being unequal" :
5879 "Unicode unequal comparison "
5880 "failed to convert both arguments to Unicode - "
5881 "interpreting them as being unequal"
5882 ) < 0)
5883 return NULL;
5884 result = (op == Py_NE);
5885 return PyBool_FromLong(result);
5886}
5887
Guido van Rossum403d68b2000-03-13 15:55:09 +00005888int PyUnicode_Contains(PyObject *container,
5889 PyObject *element)
5890{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005891 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005892 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005893
5894 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005895 sub = PyUnicode_FromObject(element);
5896 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005897 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005898 "'in <string>' requires string as left operand");
Thomas Wouters477c8d52006-05-27 19:21:47 +00005899 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005900 }
5901
Thomas Wouters477c8d52006-05-27 19:21:47 +00005902 str = PyUnicode_FromObject(container);
5903 if (!str) {
5904 Py_DECREF(sub);
5905 return -1;
5906 }
5907
5908 result = stringlib_contains_obj(str, sub);
5909
5910 Py_DECREF(str);
5911 Py_DECREF(sub);
5912
Guido van Rossum403d68b2000-03-13 15:55:09 +00005913 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005914}
5915
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916/* Concat to string or Unicode object giving a new Unicode object. */
5917
5918PyObject *PyUnicode_Concat(PyObject *left,
5919 PyObject *right)
5920{
5921 PyUnicodeObject *u = NULL, *v = NULL, *w;
5922
Guido van Rossum84d79dd2007-04-13 02:23:57 +00005923 if (PyBytes_Check(left) || PyBytes_Check(right))
5924 return PyBytes_Concat(left, right);
5925
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 /* Coerce the two arguments */
5927 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5928 if (u == NULL)
5929 goto onError;
5930 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5931 if (v == NULL)
5932 goto onError;
5933
5934 /* Shortcuts */
5935 if (v == unicode_empty) {
5936 Py_DECREF(v);
5937 return (PyObject *)u;
5938 }
5939 if (u == unicode_empty) {
5940 Py_DECREF(u);
5941 return (PyObject *)v;
5942 }
5943
5944 /* Concat the two Unicode strings */
5945 w = _PyUnicode_New(u->length + v->length);
5946 if (w == NULL)
5947 goto onError;
5948 Py_UNICODE_COPY(w->str, u->str, u->length);
5949 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5950
5951 Py_DECREF(u);
5952 Py_DECREF(v);
5953 return (PyObject *)w;
5954
5955onError:
5956 Py_XDECREF(u);
5957 Py_XDECREF(v);
5958 return NULL;
5959}
5960
Walter Dörwald1ab83302007-05-18 17:15:44 +00005961void
5962PyUnicode_Append(PyObject **pleft, PyObject *right)
5963{
5964 PyObject *new;
5965 if (*pleft == NULL)
5966 return;
5967 if (right == NULL || !PyUnicode_Check(*pleft)) {
5968 Py_DECREF(*pleft);
5969 *pleft = NULL;
5970 return;
5971 }
5972 new = PyUnicode_Concat(*pleft, right);
5973 Py_DECREF(*pleft);
5974 *pleft = new;
5975}
5976
5977void
5978PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
5979{
5980 PyUnicode_Append(pleft, right);
5981 Py_XDECREF(right);
5982}
5983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005984PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985"S.count(sub[, start[, end]]) -> int\n\
5986\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00005987Return the number of non-overlapping occurrences of substring sub in\n\
5988Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005989interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
5991static PyObject *
5992unicode_count(PyUnicodeObject *self, PyObject *args)
5993{
5994 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005995 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005996 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 PyObject *result;
5998
Guido van Rossumb8872e62000-05-09 14:14:27 +00005999 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6000 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 return NULL;
6002
6003 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006004 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 if (substring == NULL)
6006 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006007
Thomas Wouters477c8d52006-05-27 19:21:47 +00006008 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
Thomas Wouters477c8d52006-05-27 19:21:47 +00006010 result = PyInt_FromSsize_t(
6011 stringlib_count(self->str + start, end - start,
6012 substring->str, substring->length)
6013 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014
6015 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006016
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 return result;
6018}
6019
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006020PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006021"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006023Encodes S using the codec registered for encoding. encoding defaults\n\
6024to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006025handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6027'xmlcharrefreplace' as well as any other name registered with\n\
6028codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
6030static PyObject *
6031unicode_encode(PyUnicodeObject *self, PyObject *args)
6032{
6033 char *encoding = NULL;
6034 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006035 PyObject *v;
6036
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6038 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006039 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006040 if (v == NULL)
6041 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006042 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006043 if (PyString_Check(v)) {
6044 /* Old codec, turn it into bytes */
6045 PyObject *b = PyBytes_FromObject(v);
6046 Py_DECREF(v);
6047 return b;
6048 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006049 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006050 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006051 "(type=%.400s)",
6052 v->ob_type->tp_name);
6053 Py_DECREF(v);
6054 return NULL;
6055 }
6056 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006057
6058 onError:
6059 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006060}
6061
6062PyDoc_STRVAR(decode__doc__,
6063"S.decode([encoding[,errors]]) -> string or unicode\n\
6064\n\
6065Decodes S using the codec registered for encoding. encoding defaults\n\
6066to the default encoding. errors may be given to set a different error\n\
6067handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6068a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6069as well as any other name registerd with codecs.register_error that is\n\
6070able to handle UnicodeDecodeErrors.");
6071
6072static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006073unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006074{
6075 char *encoding = NULL;
6076 char *errors = NULL;
6077 PyObject *v;
6078
6079 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6080 return NULL;
6081 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006082 if (v == NULL)
6083 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006084 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6085 PyErr_Format(PyExc_TypeError,
6086 "decoder did not return a string/unicode object "
6087 "(type=%.400s)",
6088 v->ob_type->tp_name);
6089 Py_DECREF(v);
6090 return NULL;
6091 }
6092 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006093
6094 onError:
6095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096}
6097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006098PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099"S.expandtabs([tabsize]) -> unicode\n\
6100\n\
6101Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006102If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
6104static PyObject*
6105unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6106{
6107 Py_UNICODE *e;
6108 Py_UNICODE *p;
6109 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006110 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 PyUnicodeObject *u;
6112 int tabsize = 8;
6113
6114 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6115 return NULL;
6116
Thomas Wouters7e474022000-07-16 12:04:32 +00006117 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 i = j = 0;
6119 e = self->str + self->length;
6120 for (p = self->str; p < e; p++)
6121 if (*p == '\t') {
6122 if (tabsize > 0)
6123 j += tabsize - (j % tabsize);
6124 }
6125 else {
6126 j++;
6127 if (*p == '\n' || *p == '\r') {
6128 i += j;
6129 j = 0;
6130 }
6131 }
6132
6133 /* Second pass: create output string and fill it */
6134 u = _PyUnicode_New(i + j);
6135 if (!u)
6136 return NULL;
6137
6138 j = 0;
6139 q = u->str;
6140
6141 for (p = self->str; p < e; p++)
6142 if (*p == '\t') {
6143 if (tabsize > 0) {
6144 i = tabsize - (j % tabsize);
6145 j += i;
6146 while (i--)
6147 *q++ = ' ';
6148 }
6149 }
6150 else {
6151 j++;
6152 *q++ = *p;
6153 if (*p == '\n' || *p == '\r')
6154 j = 0;
6155 }
6156
6157 return (PyObject*) u;
6158}
6159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006160PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161"S.find(sub [,start [,end]]) -> int\n\
6162\n\
6163Return the lowest index in S where substring sub is found,\n\
6164such that sub is contained within s[start,end]. Optional\n\
6165arguments start and end are interpreted as in slice notation.\n\
6166\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006167Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168
6169static PyObject *
6170unicode_find(PyUnicodeObject *self, PyObject *args)
6171{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006172 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006173 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006174 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006175 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176
Guido van Rossumb8872e62000-05-09 14:14:27 +00006177 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6178 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006180 substring = PyUnicode_FromObject(substring);
6181 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 return NULL;
6183
Thomas Wouters477c8d52006-05-27 19:21:47 +00006184 result = stringlib_find_slice(
6185 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6186 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6187 start, end
6188 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189
6190 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006191
6192 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193}
6194
6195static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006196unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197{
6198 if (index < 0 || index >= self->length) {
6199 PyErr_SetString(PyExc_IndexError, "string index out of range");
6200 return NULL;
6201 }
6202
6203 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6204}
6205
6206static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006207unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006209 /* Since Unicode objects compare equal to their UTF-8 string
6210 counterparts, we hash the UTF-8 string. */
6211 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6212 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006215PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216"S.index(sub [,start [,end]]) -> int\n\
6217\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006218Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219
6220static PyObject *
6221unicode_index(PyUnicodeObject *self, PyObject *args)
6222{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006223 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006224 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006225 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006226 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227
Guido van Rossumb8872e62000-05-09 14:14:27 +00006228 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6229 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006231 substring = PyUnicode_FromObject(substring);
6232 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 return NULL;
6234
Thomas Wouters477c8d52006-05-27 19:21:47 +00006235 result = stringlib_find_slice(
6236 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6237 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6238 start, end
6239 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240
6241 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006242
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 if (result < 0) {
6244 PyErr_SetString(PyExc_ValueError, "substring not found");
6245 return NULL;
6246 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006247
Martin v. Löwis18e16552006-02-15 17:27:45 +00006248 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249}
6250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006251PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006252"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006254Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006255at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256
6257static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006258unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259{
6260 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6261 register const Py_UNICODE *e;
6262 int cased;
6263
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 /* Shortcut for single character strings */
6265 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006266 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006268 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006269 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006270 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006271
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 e = p + PyUnicode_GET_SIZE(self);
6273 cased = 0;
6274 for (; p < e; p++) {
6275 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006276
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006278 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 else if (!cased && Py_UNICODE_ISLOWER(ch))
6280 cased = 1;
6281 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006282 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283}
6284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006285PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006286"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006288Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006289at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290
6291static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006292unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293{
6294 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6295 register const Py_UNICODE *e;
6296 int cased;
6297
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 /* Shortcut for single character strings */
6299 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006300 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006302 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006303 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006304 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006305
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 e = p + PyUnicode_GET_SIZE(self);
6307 cased = 0;
6308 for (; p < e; p++) {
6309 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006310
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006312 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 else if (!cased && Py_UNICODE_ISUPPER(ch))
6314 cased = 1;
6315 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006316 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317}
6318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006319PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006320"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006322Return True if S is a titlecased string and there is at least one\n\
6323character in S, i.e. upper- and titlecase characters may only\n\
6324follow uncased characters and lowercase characters only cased ones.\n\
6325Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
6327static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006328unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329{
6330 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6331 register const Py_UNICODE *e;
6332 int cased, previous_is_cased;
6333
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334 /* Shortcut for single character strings */
6335 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006336 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6337 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006339 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006340 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006341 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006342
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 e = p + PyUnicode_GET_SIZE(self);
6344 cased = 0;
6345 previous_is_cased = 0;
6346 for (; p < e; p++) {
6347 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006348
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6350 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006351 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 previous_is_cased = 1;
6353 cased = 1;
6354 }
6355 else if (Py_UNICODE_ISLOWER(ch)) {
6356 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006357 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 previous_is_cased = 1;
6359 cased = 1;
6360 }
6361 else
6362 previous_is_cased = 0;
6363 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006364 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365}
6366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006367PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006368"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006370Return True if all characters in S are whitespace\n\
6371and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
6373static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006374unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
6376 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6377 register const Py_UNICODE *e;
6378
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 /* Shortcut for single character strings */
6380 if (PyUnicode_GET_SIZE(self) == 1 &&
6381 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006382 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006384 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006385 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006386 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 e = p + PyUnicode_GET_SIZE(self);
6389 for (; p < e; p++) {
6390 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006391 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006393 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394}
6395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006396PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006397"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006398\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006399Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006400and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006401
6402static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006403unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006404{
6405 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6406 register const Py_UNICODE *e;
6407
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006408 /* Shortcut for single character strings */
6409 if (PyUnicode_GET_SIZE(self) == 1 &&
6410 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006411 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006412
6413 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006414 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006415 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006416
6417 e = p + PyUnicode_GET_SIZE(self);
6418 for (; p < e; p++) {
6419 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006420 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006421 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006422 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006423}
6424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006425PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006426"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006427\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006428Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006429and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006430
6431static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006432unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006433{
6434 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6435 register const Py_UNICODE *e;
6436
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006437 /* Shortcut for single character strings */
6438 if (PyUnicode_GET_SIZE(self) == 1 &&
6439 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006440 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006441
6442 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006443 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006444 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006445
6446 e = p + PyUnicode_GET_SIZE(self);
6447 for (; p < e; p++) {
6448 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006449 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006450 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006451 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006452}
6453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006454PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006455"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006457Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006458False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
6460static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006461unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462{
6463 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6464 register const Py_UNICODE *e;
6465
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 /* Shortcut for single character strings */
6467 if (PyUnicode_GET_SIZE(self) == 1 &&
6468 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006469 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006471 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006472 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006473 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006474
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 e = p + PyUnicode_GET_SIZE(self);
6476 for (; p < e; p++) {
6477 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006478 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006480 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481}
6482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006483PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006484"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006486Return True if all characters in S are digits\n\
6487and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488
6489static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006490unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491{
6492 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6493 register const Py_UNICODE *e;
6494
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 /* Shortcut for single character strings */
6496 if (PyUnicode_GET_SIZE(self) == 1 &&
6497 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006498 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006500 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006501 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006502 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006503
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 e = p + PyUnicode_GET_SIZE(self);
6505 for (; p < e; p++) {
6506 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006507 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006509 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510}
6511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006512PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006513"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006515Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006516False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
6518static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006519unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520{
6521 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6522 register const Py_UNICODE *e;
6523
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 /* Shortcut for single character strings */
6525 if (PyUnicode_GET_SIZE(self) == 1 &&
6526 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006527 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006529 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006530 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006531 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006532
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 e = p + PyUnicode_GET_SIZE(self);
6534 for (; p < e; p++) {
6535 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006536 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006538 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539}
6540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006541PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542"S.join(sequence) -> unicode\n\
6543\n\
6544Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006545sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546
6547static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006548unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006550 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551}
6552
Martin v. Löwis18e16552006-02-15 17:27:45 +00006553static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554unicode_length(PyUnicodeObject *self)
6555{
6556 return self->length;
6557}
6558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006559PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006560"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561\n\
6562Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006563done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564
6565static PyObject *
6566unicode_ljust(PyUnicodeObject *self, PyObject *args)
6567{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006568 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006569 Py_UNICODE fillchar = ' ';
6570
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006571 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 return NULL;
6573
Tim Peters7a29bd52001-09-12 03:03:31 +00006574 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 Py_INCREF(self);
6576 return (PyObject*) self;
6577 }
6578
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006579 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580}
6581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006582PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583"S.lower() -> unicode\n\
6584\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006585Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586
6587static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006588unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 return fixup(self, fixlower);
6591}
6592
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode (indexed by the
   constants above). */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip mode: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6601
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006602/* externally visible for str.strip(unicode) */
6603PyObject *
6604_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6605{
6606 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006607 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006608 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006609 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6610 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006611
Thomas Wouters477c8d52006-05-27 19:21:47 +00006612 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6613
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006614 i = 0;
6615 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006616 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6617 i++;
6618 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006619 }
6620
6621 j = len;
6622 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006623 do {
6624 j--;
6625 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6626 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006627 }
6628
6629 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006630 Py_INCREF(self);
6631 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006632 }
6633 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006634 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006635}
6636
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637
6638static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006639do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006641 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006642 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006643
6644 i = 0;
6645 if (striptype != RIGHTSTRIP) {
6646 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6647 i++;
6648 }
6649 }
6650
6651 j = len;
6652 if (striptype != LEFTSTRIP) {
6653 do {
6654 j--;
6655 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6656 j++;
6657 }
6658
6659 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6660 Py_INCREF(self);
6661 return (PyObject*)self;
6662 }
6663 else
6664 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665}
6666
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006667
6668static PyObject *
6669do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6670{
6671 PyObject *sep = NULL;
6672
6673 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6674 return NULL;
6675
6676 if (sep != NULL && sep != Py_None) {
6677 if (PyUnicode_Check(sep))
6678 return _PyUnicode_XStrip(self, striptype, sep);
6679 else if (PyString_Check(sep)) {
6680 PyObject *res;
6681 sep = PyUnicode_FromObject(sep);
6682 if (sep==NULL)
6683 return NULL;
6684 res = _PyUnicode_XStrip(self, striptype, sep);
6685 Py_DECREF(sep);
6686 return res;
6687 }
6688 else {
6689 PyErr_Format(PyExc_TypeError,
6690 "%s arg must be None, unicode or str",
6691 STRIPNAME(striptype));
6692 return NULL;
6693 }
6694 }
6695
6696 return do_strip(self, striptype);
6697}
6698
6699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006700PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006701"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006702\n\
6703Return a copy of the string S with leading and trailing\n\
6704whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006705If chars is given and not None, remove characters in chars instead.\n\
6706If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006707
6708static PyObject *
6709unicode_strip(PyUnicodeObject *self, PyObject *args)
6710{
6711 if (PyTuple_GET_SIZE(args) == 0)
6712 return do_strip(self, BOTHSTRIP); /* Common case */
6713 else
6714 return do_argstrip(self, BOTHSTRIP, args);
6715}
6716
6717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006718PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006719"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006720\n\
6721Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006722If chars is given and not None, remove characters in chars instead.\n\
6723If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006724
6725static PyObject *
6726unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6727{
6728 if (PyTuple_GET_SIZE(args) == 0)
6729 return do_strip(self, LEFTSTRIP); /* Common case */
6730 else
6731 return do_argstrip(self, LEFTSTRIP, args);
6732}
6733
6734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006735PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006736"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006737\n\
6738Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006739If chars is given and not None, remove characters in chars instead.\n\
6740If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006741
6742static PyObject *
6743unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6744{
6745 if (PyTuple_GET_SIZE(args) == 0)
6746 return do_strip(self, RIGHTSTRIP); /* Common case */
6747 else
6748 return do_argstrip(self, RIGHTSTRIP, args);
6749}
6750
6751
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006753unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754{
6755 PyUnicodeObject *u;
6756 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006757 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006758 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
6760 if (len < 0)
6761 len = 0;
6762
Tim Peters7a29bd52001-09-12 03:03:31 +00006763 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 /* no repeat, return original string */
6765 Py_INCREF(str);
6766 return (PyObject*) str;
6767 }
Tim Peters8f422462000-09-09 06:13:41 +00006768
6769 /* ensure # of chars needed doesn't overflow int and # of bytes
6770 * needed doesn't overflow size_t
6771 */
6772 nchars = len * str->length;
6773 if (len && nchars / len != str->length) {
6774 PyErr_SetString(PyExc_OverflowError,
6775 "repeated string is too long");
6776 return NULL;
6777 }
6778 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6779 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6780 PyErr_SetString(PyExc_OverflowError,
6781 "repeated string is too long");
6782 return NULL;
6783 }
6784 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 if (!u)
6786 return NULL;
6787
6788 p = u->str;
6789
Thomas Wouters477c8d52006-05-27 19:21:47 +00006790 if (str->length == 1 && len > 0) {
6791 Py_UNICODE_FILL(p, str->str[0], len);
6792 } else {
6793 Py_ssize_t done = 0; /* number of characters copied this far */
6794 if (done < nchars) {
6795 Py_UNICODE_COPY(p, str->str, str->length);
6796 done = str->length;
6797 }
6798 while (done < nchars) {
6799 int n = (done <= nchars-done) ? done : nchars-done;
6800 Py_UNICODE_COPY(p+done, p, n);
6801 done += n;
6802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 }
6804
6805 return (PyObject*) u;
6806}
6807
6808PyObject *PyUnicode_Replace(PyObject *obj,
6809 PyObject *subobj,
6810 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006811 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812{
6813 PyObject *self;
6814 PyObject *str1;
6815 PyObject *str2;
6816 PyObject *result;
6817
6818 self = PyUnicode_FromObject(obj);
6819 if (self == NULL)
6820 return NULL;
6821 str1 = PyUnicode_FromObject(subobj);
6822 if (str1 == NULL) {
6823 Py_DECREF(self);
6824 return NULL;
6825 }
6826 str2 = PyUnicode_FromObject(replobj);
6827 if (str2 == NULL) {
6828 Py_DECREF(self);
6829 Py_DECREF(str1);
6830 return NULL;
6831 }
Tim Petersced69f82003-09-16 20:30:58 +00006832 result = replace((PyUnicodeObject *)self,
6833 (PyUnicodeObject *)str1,
6834 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 maxcount);
6836 Py_DECREF(self);
6837 Py_DECREF(str1);
6838 Py_DECREF(str2);
6839 return result;
6840}
6841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006842PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843"S.replace (old, new[, maxsplit]) -> unicode\n\
6844\n\
6845Return a copy of S with all occurrences of substring\n\
6846old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006847given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848
6849static PyObject*
6850unicode_replace(PyUnicodeObject *self, PyObject *args)
6851{
6852 PyUnicodeObject *str1;
6853 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006854 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 PyObject *result;
6856
Martin v. Löwis18e16552006-02-15 17:27:45 +00006857 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 return NULL;
6859 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6860 if (str1 == NULL)
6861 return NULL;
6862 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006863 if (str2 == NULL) {
6864 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867
6868 result = replace(self, str1, str2, maxcount);
6869
6870 Py_DECREF(str1);
6871 Py_DECREF(str2);
6872 return result;
6873}
6874
6875static
6876PyObject *unicode_repr(PyObject *unicode)
6877{
Walter Dörwald79e913e2007-05-12 11:08:06 +00006878 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006879 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006880 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
6881 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
6882
6883 /* XXX(nnorwitz): rather than over-allocating, it would be
6884 better to choose a different scheme. Perhaps scan the
6885 first N-chars of the string and allocate based on that size.
6886 */
6887 /* Initial allocation is based on the longest-possible unichr
6888 escape.
6889
6890 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6891 unichr, so in this case it's the longest unichr escape. In
6892 narrow (UTF-16) builds this is five chars per source unichr
6893 since there are two unichrs in the surrogate pair, so in narrow
6894 (UTF-16) builds it's not the longest unichr escape.
6895
6896 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6897 so in the narrow (UTF-16) build case it's the longest unichr
6898 escape.
6899 */
6900
Walter Dörwald1ab83302007-05-18 17:15:44 +00006901 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00006902 2 /* quotes */
6903#ifdef Py_UNICODE_WIDE
6904 + 10*size
6905#else
6906 + 6*size
6907#endif
6908 + 1);
6909 if (repr == NULL)
6910 return NULL;
6911
Walter Dörwald1ab83302007-05-18 17:15:44 +00006912 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00006913
6914 /* Add quote */
6915 *p++ = (findchar(s, size, '\'') &&
6916 !findchar(s, size, '"')) ? '"' : '\'';
6917 while (size-- > 0) {
6918 Py_UNICODE ch = *s++;
6919
6920 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00006921 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00006922 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00006923 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006924 continue;
6925 }
6926
6927#ifdef Py_UNICODE_WIDE
6928 /* Map 21-bit characters to '\U00xxxxxx' */
6929 else if (ch >= 0x10000) {
6930 *p++ = '\\';
6931 *p++ = 'U';
6932 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6933 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6934 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6935 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6936 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6937 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6938 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6939 *p++ = hexdigits[ch & 0x0000000F];
6940 continue;
6941 }
6942#else
6943 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6944 else if (ch >= 0xD800 && ch < 0xDC00) {
6945 Py_UNICODE ch2;
6946 Py_UCS4 ucs;
6947
6948 ch2 = *s++;
6949 size--;
6950 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6951 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6952 *p++ = '\\';
6953 *p++ = 'U';
6954 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6955 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6956 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6957 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6958 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6959 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6960 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6961 *p++ = hexdigits[ucs & 0x0000000F];
6962 continue;
6963 }
6964 /* Fall through: isolated surrogates are copied as-is */
6965 s--;
6966 size++;
6967 }
6968#endif
6969
6970 /* Map 16-bit characters to '\uxxxx' */
6971 if (ch >= 256) {
6972 *p++ = '\\';
6973 *p++ = 'u';
6974 *p++ = hexdigits[(ch >> 12) & 0x000F];
6975 *p++ = hexdigits[(ch >> 8) & 0x000F];
6976 *p++ = hexdigits[(ch >> 4) & 0x000F];
6977 *p++ = hexdigits[ch & 0x000F];
6978 }
6979
6980 /* Map special whitespace to '\t', \n', '\r' */
6981 else if (ch == '\t') {
6982 *p++ = '\\';
6983 *p++ = 't';
6984 }
6985 else if (ch == '\n') {
6986 *p++ = '\\';
6987 *p++ = 'n';
6988 }
6989 else if (ch == '\r') {
6990 *p++ = '\\';
6991 *p++ = 'r';
6992 }
6993
6994 /* Map non-printable US ASCII to '\xhh' */
6995 else if (ch < ' ' || ch >= 0x7F) {
6996 *p++ = '\\';
6997 *p++ = 'x';
6998 *p++ = hexdigits[(ch >> 4) & 0x000F];
6999 *p++ = hexdigits[ch & 0x000F];
7000 }
7001
7002 /* Copy everything else as-is */
7003 else
7004 *p++ = (char) ch;
7005 }
7006 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007007 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007008
7009 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007010 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007011 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012}
7013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007014PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015"S.rfind(sub [,start [,end]]) -> int\n\
7016\n\
7017Return the highest index in S where substring sub is found,\n\
7018such that sub is contained within s[start,end]. Optional\n\
7019arguments start and end are interpreted as in slice notation.\n\
7020\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007021Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022
7023static PyObject *
7024unicode_rfind(PyUnicodeObject *self, PyObject *args)
7025{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007026 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007027 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007028 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007029 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030
Guido van Rossumb8872e62000-05-09 14:14:27 +00007031 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7032 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007034 substring = PyUnicode_FromObject(substring);
7035 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 return NULL;
7037
Thomas Wouters477c8d52006-05-27 19:21:47 +00007038 result = stringlib_rfind_slice(
7039 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7040 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7041 start, end
7042 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043
7044 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007045
7046 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047}
7048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007049PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050"S.rindex(sub [,start [,end]]) -> int\n\
7051\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007052Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053
7054static PyObject *
7055unicode_rindex(PyUnicodeObject *self, PyObject *args)
7056{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007057 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007058 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007059 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007060 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061
Guido van Rossumb8872e62000-05-09 14:14:27 +00007062 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7063 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007065 substring = PyUnicode_FromObject(substring);
7066 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 return NULL;
7068
Thomas Wouters477c8d52006-05-27 19:21:47 +00007069 result = stringlib_rfind_slice(
7070 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7071 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7072 start, end
7073 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074
7075 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007076
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 if (result < 0) {
7078 PyErr_SetString(PyExc_ValueError, "substring not found");
7079 return NULL;
7080 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007081 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082}
7083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007084PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007085"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086\n\
7087Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007088done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089
7090static PyObject *
7091unicode_rjust(PyUnicodeObject *self, PyObject *args)
7092{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007093 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007094 Py_UNICODE fillchar = ' ';
7095
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007096 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 return NULL;
7098
Tim Peters7a29bd52001-09-12 03:03:31 +00007099 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100 Py_INCREF(self);
7101 return (PyObject*) self;
7102 }
7103
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007104 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105}
7106
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007108unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109{
7110 /* standard clamping */
7111 if (start < 0)
7112 start = 0;
7113 if (end < 0)
7114 end = 0;
7115 if (end > self->length)
7116 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007117 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 /* full slice, return original string */
7119 Py_INCREF(self);
7120 return (PyObject*) self;
7121 }
7122 if (start > end)
7123 start = end;
7124 /* copy slice */
7125 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7126 end - start);
7127}
7128
7129PyObject *PyUnicode_Split(PyObject *s,
7130 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007131 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132{
7133 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007134
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135 s = PyUnicode_FromObject(s);
7136 if (s == NULL)
7137 return NULL;
7138 if (sep != NULL) {
7139 sep = PyUnicode_FromObject(sep);
7140 if (sep == NULL) {
7141 Py_DECREF(s);
7142 return NULL;
7143 }
7144 }
7145
7146 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7147
7148 Py_DECREF(s);
7149 Py_XDECREF(sep);
7150 return result;
7151}
7152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007153PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154"S.split([sep [,maxsplit]]) -> list of strings\n\
7155\n\
7156Return a list of the words in S, using sep as the\n\
7157delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007158splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007159any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
7161static PyObject*
7162unicode_split(PyUnicodeObject *self, PyObject *args)
7163{
7164 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007165 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166
Martin v. Löwis18e16552006-02-15 17:27:45 +00007167 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 return NULL;
7169
7170 if (substring == Py_None)
7171 return split(self, NULL, maxcount);
7172 else if (PyUnicode_Check(substring))
7173 return split(self, (PyUnicodeObject *)substring, maxcount);
7174 else
7175 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7176}
7177
Thomas Wouters477c8d52006-05-27 19:21:47 +00007178PyObject *
7179PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7180{
7181 PyObject* str_obj;
7182 PyObject* sep_obj;
7183 PyObject* out;
7184
7185 str_obj = PyUnicode_FromObject(str_in);
7186 if (!str_obj)
7187 return NULL;
7188 sep_obj = PyUnicode_FromObject(sep_in);
7189 if (!sep_obj) {
7190 Py_DECREF(str_obj);
7191 return NULL;
7192 }
7193
7194 out = stringlib_partition(
7195 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7196 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7197 );
7198
7199 Py_DECREF(sep_obj);
7200 Py_DECREF(str_obj);
7201
7202 return out;
7203}
7204
7205
7206PyObject *
7207PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7208{
7209 PyObject* str_obj;
7210 PyObject* sep_obj;
7211 PyObject* out;
7212
7213 str_obj = PyUnicode_FromObject(str_in);
7214 if (!str_obj)
7215 return NULL;
7216 sep_obj = PyUnicode_FromObject(sep_in);
7217 if (!sep_obj) {
7218 Py_DECREF(str_obj);
7219 return NULL;
7220 }
7221
7222 out = stringlib_rpartition(
7223 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7224 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7225 );
7226
7227 Py_DECREF(sep_obj);
7228 Py_DECREF(str_obj);
7229
7230 return out;
7231}
7232
7233PyDoc_STRVAR(partition__doc__,
7234"S.partition(sep) -> (head, sep, tail)\n\
7235\n\
7236Searches for the separator sep in S, and returns the part before it,\n\
7237the separator itself, and the part after it. If the separator is not\n\
7238found, returns S and two empty strings.");
7239
7240static PyObject*
7241unicode_partition(PyUnicodeObject *self, PyObject *separator)
7242{
7243 return PyUnicode_Partition((PyObject *)self, separator);
7244}
7245
7246PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007247"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007248\n\
7249Searches for the separator sep in S, starting at the end of S, and returns\n\
7250the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007251separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007252
7253static PyObject*
7254unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7255{
7256 return PyUnicode_RPartition((PyObject *)self, separator);
7257}
7258
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007259PyObject *PyUnicode_RSplit(PyObject *s,
7260 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007261 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007262{
7263 PyObject *result;
7264
7265 s = PyUnicode_FromObject(s);
7266 if (s == NULL)
7267 return NULL;
7268 if (sep != NULL) {
7269 sep = PyUnicode_FromObject(sep);
7270 if (sep == NULL) {
7271 Py_DECREF(s);
7272 return NULL;
7273 }
7274 }
7275
7276 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7277
7278 Py_DECREF(s);
7279 Py_XDECREF(sep);
7280 return result;
7281}
7282
7283PyDoc_STRVAR(rsplit__doc__,
7284"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7285\n\
7286Return a list of the words in S, using sep as the\n\
7287delimiter string, starting at the end of the string and\n\
7288working to the front. If maxsplit is given, at most maxsplit\n\
7289splits are done. If sep is not specified, any whitespace string\n\
7290is a separator.");
7291
7292static PyObject*
7293unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7294{
7295 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007296 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007297
Martin v. Löwis18e16552006-02-15 17:27:45 +00007298 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007299 return NULL;
7300
7301 if (substring == Py_None)
7302 return rsplit(self, NULL, maxcount);
7303 else if (PyUnicode_Check(substring))
7304 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7305 else
7306 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7307}
7308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007309PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007310"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311\n\
7312Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007313Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007314is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
7316static PyObject*
7317unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7318{
Guido van Rossum86662912000-04-11 15:38:46 +00007319 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320
Guido van Rossum86662912000-04-11 15:38:46 +00007321 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322 return NULL;
7323
Guido van Rossum86662912000-04-11 15:38:46 +00007324 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325}
7326
7327static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007328PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007330 PyObject *res = _PyUnicode_AsDefaultEncodedString(self, NULL);
7331 Py_XINCREF(res);
7332 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333}
7334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007335PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336"S.swapcase() -> unicode\n\
7337\n\
7338Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007339and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340
7341static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007342unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 return fixup(self, fixswapcase);
7345}
7346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007347PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348"S.translate(table) -> unicode\n\
7349\n\
7350Return a copy of the string S, where all characters have been mapped\n\
7351through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007352Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7353Unmapped characters are left untouched. Characters mapped to None\n\
7354are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355
7356static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007357unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358{
Tim Petersced69f82003-09-16 20:30:58 +00007359 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007361 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362 "ignore");
7363}
7364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007365PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366"S.upper() -> unicode\n\
7367\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007368Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369
7370static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007371unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373 return fixup(self, fixupper);
7374}
7375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007376PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377"S.zfill(width) -> unicode\n\
7378\n\
7379Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007380of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381
7382static PyObject *
7383unicode_zfill(PyUnicodeObject *self, PyObject *args)
7384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007385 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 PyUnicodeObject *u;
7387
Martin v. Löwis18e16552006-02-15 17:27:45 +00007388 Py_ssize_t width;
7389 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 return NULL;
7391
7392 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007393 if (PyUnicode_CheckExact(self)) {
7394 Py_INCREF(self);
7395 return (PyObject*) self;
7396 }
7397 else
7398 return PyUnicode_FromUnicode(
7399 PyUnicode_AS_UNICODE(self),
7400 PyUnicode_GET_SIZE(self)
7401 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 }
7403
7404 fill = width - self->length;
7405
7406 u = pad(self, fill, 0, '0');
7407
Walter Dörwald068325e2002-04-15 13:36:47 +00007408 if (u == NULL)
7409 return NULL;
7410
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 if (u->str[fill] == '+' || u->str[fill] == '-') {
7412 /* move sign to beginning of string */
7413 u->str[0] = u->str[fill];
7414 u->str[fill] = '0';
7415 }
7416
7417 return (PyObject*) u;
7418}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419
#if 0
/* Compiled out: reports unicode_freelist_size as a Python integer. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007428PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007429"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007431Return True if S starts with the specified prefix, False otherwise.\n\
7432With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007433With optional end, stop comparing S at that position.\n\
7434prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435
7436static PyObject *
7437unicode_startswith(PyUnicodeObject *self,
7438 PyObject *args)
7439{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007440 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007442 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007443 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007446 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007447 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007449 if (PyTuple_Check(subobj)) {
7450 Py_ssize_t i;
7451 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7452 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7453 PyTuple_GET_ITEM(subobj, i));
7454 if (substring == NULL)
7455 return NULL;
7456 result = tailmatch(self, substring, start, end, -1);
7457 Py_DECREF(substring);
7458 if (result) {
7459 Py_RETURN_TRUE;
7460 }
7461 }
7462 /* nothing matched */
7463 Py_RETURN_FALSE;
7464 }
7465 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007467 return NULL;
7468 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007470 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471}
7472
7473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007474PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007475"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007477Return True if S ends with the specified suffix, False otherwise.\n\
7478With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479With optional end, stop comparing S at that position.\n\
7480suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481
7482static PyObject *
7483unicode_endswith(PyUnicodeObject *self,
7484 PyObject *args)
7485{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007488 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007489 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007490 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007492 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7493 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007495 if (PyTuple_Check(subobj)) {
7496 Py_ssize_t i;
7497 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7498 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7499 PyTuple_GET_ITEM(subobj, i));
7500 if (substring == NULL)
7501 return NULL;
7502 result = tailmatch(self, substring, start, end, +1);
7503 Py_DECREF(substring);
7504 if (result) {
7505 Py_RETURN_TRUE;
7506 }
7507 }
7508 Py_RETURN_FALSE;
7509 }
7510 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007516 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517}
7518
7519
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007520
7521static PyObject *
7522unicode_getnewargs(PyUnicodeObject *v)
7523{
7524 return Py_BuildValue("(u#)", v->str, v->length);
7525}
7526
7527
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528static PyMethodDef unicode_methods[] = {
7529
7530 /* Order is according to common usage: often used methods should
7531 appear first, since lookup is done sequentially. */
7532
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007533 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7534 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7535 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007536 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007537 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7538 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7539 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7540 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7541 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7542 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7543 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007544 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007545 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7546 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7547 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007548 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007549 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007550/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7551 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7552 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7553 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007554 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007555 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007556 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007557 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007558 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7559 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7560 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7561 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7562 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7563 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7564 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7565 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7566 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7567 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7568 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7569 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7570 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7571 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007572 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007573#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007574 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575#endif
7576
7577#if 0
7578 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007579 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580#endif
7581
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007582 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 {NULL, NULL}
7584};
7585
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007586static PyObject *
7587unicode_mod(PyObject *v, PyObject *w)
7588{
7589 if (!PyUnicode_Check(v)) {
7590 Py_INCREF(Py_NotImplemented);
7591 return Py_NotImplemented;
7592 }
7593 return PyUnicode_Format(v, w);
7594}
7595
7596static PyNumberMethods unicode_as_number = {
7597 0, /*nb_add*/
7598 0, /*nb_subtract*/
7599 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007600 unicode_mod, /*nb_remainder*/
7601};
7602
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007604 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007605 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007606 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7607 (ssizeargfunc) unicode_getitem, /* sq_item */
7608 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609 0, /* sq_ass_item */
7610 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007611 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612};
7613
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007614static PyObject*
7615unicode_subscript(PyUnicodeObject* self, PyObject* item)
7616{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007617 if (PyIndex_Check(item)) {
7618 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007619 if (i == -1 && PyErr_Occurred())
7620 return NULL;
7621 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007622 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007623 return unicode_getitem(self, i);
7624 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007625 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007626 Py_UNICODE* source_buf;
7627 Py_UNICODE* result_buf;
7628 PyObject* result;
7629
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007630 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007631 &start, &stop, &step, &slicelength) < 0) {
7632 return NULL;
7633 }
7634
7635 if (slicelength <= 0) {
7636 return PyUnicode_FromUnicode(NULL, 0);
7637 } else {
7638 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007639 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7640 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007641
7642 if (result_buf == NULL)
7643 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007644
7645 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7646 result_buf[i] = source_buf[cur];
7647 }
Tim Petersced69f82003-09-16 20:30:58 +00007648
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007649 result = PyUnicode_FromUnicode(result_buf, slicelength);
7650 PyMem_FREE(result_buf);
7651 return result;
7652 }
7653 } else {
7654 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7655 return NULL;
7656 }
7657}
7658
7659static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007660 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007661 (binaryfunc)unicode_subscript, /* mp_subscript */
7662 (objobjargproc)0, /* mp_ass_subscript */
7663};
7664
Martin v. Löwis18e16552006-02-15 17:27:45 +00007665static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007667 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668 const void **ptr)
7669{
7670 if (index != 0) {
7671 PyErr_SetString(PyExc_SystemError,
7672 "accessing non-existent unicode segment");
7673 return -1;
7674 }
7675 *ptr = (void *) self->str;
7676 return PyUnicode_GET_DATA_SIZE(self);
7677}
7678
Martin v. Löwis18e16552006-02-15 17:27:45 +00007679static Py_ssize_t
7680unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 const void **ptr)
7682{
7683 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007684 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685 return -1;
7686}
7687
7688static int
7689unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007690 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691{
7692 if (lenp)
7693 *lenp = PyUnicode_GET_DATA_SIZE(self);
7694 return 1;
7695}
7696
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007697static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007699 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 const void **ptr)
7701{
7702 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007703
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704 if (index != 0) {
7705 PyErr_SetString(PyExc_SystemError,
7706 "accessing non-existent unicode segment");
7707 return -1;
7708 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007709 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 if (str == NULL)
7711 return -1;
7712 *ptr = (void *) PyString_AS_STRING(str);
7713 return PyString_GET_SIZE(str);
7714}
7715
7716/* Helpers for PyUnicode_Format() */
7717
7718static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007719getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007721 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 if (argidx < arglen) {
7723 (*p_argidx)++;
7724 if (arglen < 0)
7725 return args;
7726 else
7727 return PyTuple_GetItem(args, argidx);
7728 }
7729 PyErr_SetString(PyExc_TypeError,
7730 "not enough arguments for format string");
7731 return NULL;
7732}
7733
/* Bit flags collected from the flag characters of a format specifier. */
#define F_LJUST 0x01    /* '-' : left-justify within the field */
#define F_SIGN  0x02    /* '+' : always emit a sign */
#define F_BLANK 0x04    /* ' ' : blank in place of a '+' sign */
#define F_ALT   0x08    /* '#' : alternate form */
#define F_ZERO  0x10    /* '0' : pad numbers with zeros */
7739
Martin v. Löwis18e16552006-02-15 17:27:45 +00007740static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007741strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007743 register Py_ssize_t i;
7744 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 for (i = len - 1; i >= 0; i--)
7746 buffer[i] = (Py_UNICODE) charbuffer[i];
7747
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 return len;
7749}
7750
Neal Norwitzfc76d632006-01-10 06:03:13 +00007751static int
7752doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7753{
Tim Peters15231542006-02-16 01:08:01 +00007754 Py_ssize_t result;
7755
Neal Norwitzfc76d632006-01-10 06:03:13 +00007756 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007757 result = strtounicode(buffer, (char *)buffer);
7758 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007759}
7760
7761static int
7762longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7763{
Tim Peters15231542006-02-16 01:08:01 +00007764 Py_ssize_t result;
7765
Neal Norwitzfc76d632006-01-10 06:03:13 +00007766 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007767 result = strtounicode(buffer, (char *)buffer);
7768 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007769}
7770
Guido van Rossum078151d2002-08-11 04:24:12 +00007771/* XXX To save some code duplication, formatfloat/long/int could have been
7772 shared with stringobject.c, converting from 8-bit to Unicode after the
7773 formatting is done. */
7774
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775static int
7776formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007777 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778 int flags,
7779 int prec,
7780 int type,
7781 PyObject *v)
7782{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007783 /* fmt = '%#.' + `prec` + `type`
7784 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 char fmt[20];
7786 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007787
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788 x = PyFloat_AsDouble(v);
7789 if (x == -1.0 && PyErr_Occurred())
7790 return -1;
7791 if (prec < 0)
7792 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7794 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007795 /* Worst case length calc to ensure no buffer overrun:
7796
7797 'g' formats:
7798 fmt = %#.<prec>g
7799 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7800 for any double rep.)
7801 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7802
7803 'f' formats:
7804 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7805 len = 1 + 50 + 1 + prec = 52 + prec
7806
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007807 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007808 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007809
7810 */
7811 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7812 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007813 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007814 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007815 return -1;
7816 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007817 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7818 (flags&F_ALT) ? "#" : "",
7819 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007820 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821}
7822
Tim Peters38fd5b62000-09-21 05:43:11 +00007823static PyObject*
7824formatlong(PyObject *val, int flags, int prec, int type)
7825{
7826 char *buf;
7827 int i, len;
7828 PyObject *str; /* temporary string object. */
7829 PyUnicodeObject *result;
7830
7831 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7832 if (!str)
7833 return NULL;
7834 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007835 if (!result) {
7836 Py_DECREF(str);
7837 return NULL;
7838 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007839 for (i = 0; i < len; i++)
7840 result->str[i] = buf[i];
7841 result->str[len] = 0;
7842 Py_DECREF(str);
7843 return (PyObject*)result;
7844}
7845
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846static int
7847formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007848 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 int flags,
7850 int prec,
7851 int type,
7852 PyObject *v)
7853{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007854 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007855 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7856 * + 1 + 1
7857 * = 24
7858 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007859 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007860 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 long x;
7862
7863 x = PyInt_AsLong(v);
7864 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007865 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007866 if (x < 0 && type == 'u') {
7867 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007868 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007869 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7870 sign = "-";
7871 else
7872 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007874 prec = 1;
7875
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007876 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7877 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007878 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007879 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007880 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007881 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007882 return -1;
7883 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007884
7885 if ((flags & F_ALT) &&
7886 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007887 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007888 * of issues that cause pain:
7889 * - when 0 is being converted, the C standard leaves off
7890 * the '0x' or '0X', which is inconsistent with other
7891 * %#x/%#X conversions and inconsistent with Python's
7892 * hex() function
7893 * - there are platforms that violate the standard and
7894 * convert 0 with the '0x' or '0X'
7895 * (Metrowerks, Compaq Tru64)
7896 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007897 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007898 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007899 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007900 * We can achieve the desired consistency by inserting our
7901 * own '0x' or '0X' prefix, and substituting %x/%X in place
7902 * of %#x/%#X.
7903 *
7904 * Note that this is the same approach as used in
7905 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007906 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007907 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7908 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007909 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007910 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007911 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7912 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007913 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007914 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007915 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007916 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007917 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007918 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919}
7920
7921static int
7922formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007923 size_t buflen,
7924 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007926 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007927 if (PyUnicode_Check(v)) {
7928 if (PyUnicode_GET_SIZE(v) != 1)
7929 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007933 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007934 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007935 goto onError;
7936 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938
7939 else {
7940 /* Integer input truncated to a character */
7941 long x;
7942 x = PyInt_AsLong(v);
7943 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007944 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007945#ifdef Py_UNICODE_WIDE
7946 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007947 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007948 "%c arg not in range(0x110000) "
7949 "(wide Python build)");
7950 return -1;
7951 }
7952#else
7953 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007954 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007955 "%c arg not in range(0x10000) "
7956 "(narrow Python build)");
7957 return -1;
7958 }
7959#endif
7960 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 }
7962 buf[1] = '\0';
7963 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007964
7965 onError:
7966 PyErr_SetString(PyExc_TypeError,
7967 "%c requires int or char");
7968 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969}
7970
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007971/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7972
7973 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7974 chars are formatted. XXX This is a magic number. Each formatting
7975 routine does bounds checking to ensure no overflow, but a better
7976 solution may be to malloc a buffer of appropriate size for each
7977 format. For now, the current solution is sufficient.
7978*/
/* Fully parenthesized so the expansion stays a single operand wherever
   the macro is used (e.g. after sizeof or next to a postfix operator). */
#define FORMATBUFLEN ((size_t)120)
7980
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981PyObject *PyUnicode_Format(PyObject *format,
7982 PyObject *args)
7983{
7984 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007985 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 int args_owned = 0;
7987 PyUnicodeObject *result = NULL;
7988 PyObject *dict = NULL;
7989 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007990
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 if (format == NULL || args == NULL) {
7992 PyErr_BadInternalCall();
7993 return NULL;
7994 }
7995 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007996 if (uformat == NULL)
7997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 fmt = PyUnicode_AS_UNICODE(uformat);
7999 fmtcnt = PyUnicode_GET_SIZE(uformat);
8000
8001 reslen = rescnt = fmtcnt + 100;
8002 result = _PyUnicode_New(reslen);
8003 if (result == NULL)
8004 goto onError;
8005 res = PyUnicode_AS_UNICODE(result);
8006
8007 if (PyTuple_Check(args)) {
8008 arglen = PyTuple_Size(args);
8009 argidx = 0;
8010 }
8011 else {
8012 arglen = -1;
8013 argidx = -2;
8014 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008015 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
8016 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 dict = args;
8018
8019 while (--fmtcnt >= 0) {
8020 if (*fmt != '%') {
8021 if (--rescnt < 0) {
8022 rescnt = fmtcnt + 100;
8023 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008024 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008025 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8027 --rescnt;
8028 }
8029 *res++ = *fmt++;
8030 }
8031 else {
8032 /* Got a format specifier */
8033 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008034 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 Py_UNICODE c = '\0';
8037 Py_UNICODE fill;
8038 PyObject *v = NULL;
8039 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008040 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008042 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008043 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044
8045 fmt++;
8046 if (*fmt == '(') {
8047 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008048 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 PyObject *key;
8050 int pcount = 1;
8051
8052 if (dict == NULL) {
8053 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008054 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 goto onError;
8056 }
8057 ++fmt;
8058 --fmtcnt;
8059 keystart = fmt;
8060 /* Skip over balanced parentheses */
8061 while (pcount > 0 && --fmtcnt >= 0) {
8062 if (*fmt == ')')
8063 --pcount;
8064 else if (*fmt == '(')
8065 ++pcount;
8066 fmt++;
8067 }
8068 keylen = fmt - keystart - 1;
8069 if (fmtcnt < 0 || pcount > 0) {
8070 PyErr_SetString(PyExc_ValueError,
8071 "incomplete format key");
8072 goto onError;
8073 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008074#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008075 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 then looked up since Python uses strings to hold
8077 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008078 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 key = PyUnicode_EncodeUTF8(keystart,
8080 keylen,
8081 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008082#else
8083 key = PyUnicode_FromUnicode(keystart, keylen);
8084#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 if (key == NULL)
8086 goto onError;
8087 if (args_owned) {
8088 Py_DECREF(args);
8089 args_owned = 0;
8090 }
8091 args = PyObject_GetItem(dict, key);
8092 Py_DECREF(key);
8093 if (args == NULL) {
8094 goto onError;
8095 }
8096 args_owned = 1;
8097 arglen = -1;
8098 argidx = -2;
8099 }
8100 while (--fmtcnt >= 0) {
8101 switch (c = *fmt++) {
8102 case '-': flags |= F_LJUST; continue;
8103 case '+': flags |= F_SIGN; continue;
8104 case ' ': flags |= F_BLANK; continue;
8105 case '#': flags |= F_ALT; continue;
8106 case '0': flags |= F_ZERO; continue;
8107 }
8108 break;
8109 }
8110 if (c == '*') {
8111 v = getnextarg(args, arglen, &argidx);
8112 if (v == NULL)
8113 goto onError;
8114 if (!PyInt_Check(v)) {
8115 PyErr_SetString(PyExc_TypeError,
8116 "* wants int");
8117 goto onError;
8118 }
8119 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008120 if (width == -1 && PyErr_Occurred())
8121 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 if (width < 0) {
8123 flags |= F_LJUST;
8124 width = -width;
8125 }
8126 if (--fmtcnt >= 0)
8127 c = *fmt++;
8128 }
8129 else if (c >= '0' && c <= '9') {
8130 width = c - '0';
8131 while (--fmtcnt >= 0) {
8132 c = *fmt++;
8133 if (c < '0' || c > '9')
8134 break;
8135 if ((width*10) / 10 != width) {
8136 PyErr_SetString(PyExc_ValueError,
8137 "width too big");
8138 goto onError;
8139 }
8140 width = width*10 + (c - '0');
8141 }
8142 }
8143 if (c == '.') {
8144 prec = 0;
8145 if (--fmtcnt >= 0)
8146 c = *fmt++;
8147 if (c == '*') {
8148 v = getnextarg(args, arglen, &argidx);
8149 if (v == NULL)
8150 goto onError;
8151 if (!PyInt_Check(v)) {
8152 PyErr_SetString(PyExc_TypeError,
8153 "* wants int");
8154 goto onError;
8155 }
8156 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008157 if (prec == -1 && PyErr_Occurred())
8158 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 if (prec < 0)
8160 prec = 0;
8161 if (--fmtcnt >= 0)
8162 c = *fmt++;
8163 }
8164 else if (c >= '0' && c <= '9') {
8165 prec = c - '0';
8166 while (--fmtcnt >= 0) {
8167 c = Py_CHARMASK(*fmt++);
8168 if (c < '0' || c > '9')
8169 break;
8170 if ((prec*10) / 10 != prec) {
8171 PyErr_SetString(PyExc_ValueError,
8172 "prec too big");
8173 goto onError;
8174 }
8175 prec = prec*10 + (c - '0');
8176 }
8177 }
8178 } /* prec */
8179 if (fmtcnt >= 0) {
8180 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 if (--fmtcnt >= 0)
8182 c = *fmt++;
8183 }
8184 }
8185 if (fmtcnt < 0) {
8186 PyErr_SetString(PyExc_ValueError,
8187 "incomplete format");
8188 goto onError;
8189 }
8190 if (c != '%') {
8191 v = getnextarg(args, arglen, &argidx);
8192 if (v == NULL)
8193 goto onError;
8194 }
8195 sign = 0;
8196 fill = ' ';
8197 switch (c) {
8198
8199 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008200 pbuf = formatbuf;
8201 /* presume that buffer length is at least 1 */
8202 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 len = 1;
8204 break;
8205
8206 case 's':
8207 case 'r':
8208 if (PyUnicode_Check(v) && c == 's') {
8209 temp = v;
8210 Py_INCREF(temp);
8211 }
8212 else {
8213 PyObject *unicode;
8214 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008215 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 else
8217 temp = PyObject_Repr(v);
8218 if (temp == NULL)
8219 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008220 if (PyUnicode_Check(temp))
8221 /* nothing to do */;
8222 else if (PyString_Check(temp)) {
8223 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008224 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008226 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008228 Py_DECREF(temp);
8229 temp = unicode;
8230 if (temp == NULL)
8231 goto onError;
8232 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008233 else {
8234 Py_DECREF(temp);
8235 PyErr_SetString(PyExc_TypeError,
8236 "%s argument has non-string str()");
8237 goto onError;
8238 }
8239 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008240 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 len = PyUnicode_GET_SIZE(temp);
8242 if (prec >= 0 && len > prec)
8243 len = prec;
8244 break;
8245
8246 case 'i':
8247 case 'd':
8248 case 'u':
8249 case 'o':
8250 case 'x':
8251 case 'X':
8252 if (c == 'i')
8253 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008254 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008255 temp = formatlong(v, flags, prec, c);
8256 if (!temp)
8257 goto onError;
8258 pbuf = PyUnicode_AS_UNICODE(temp);
8259 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008260 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008262 else {
8263 pbuf = formatbuf;
8264 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8265 flags, prec, c, v);
8266 if (len < 0)
8267 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008268 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008269 }
8270 if (flags & F_ZERO)
8271 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 break;
8273
8274 case 'e':
8275 case 'E':
8276 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008277 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 case 'g':
8279 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008280 if (c == 'F')
8281 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008282 pbuf = formatbuf;
8283 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8284 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 if (len < 0)
8286 goto onError;
8287 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008288 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 fill = '0';
8290 break;
8291
8292 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008293 pbuf = formatbuf;
8294 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 if (len < 0)
8296 goto onError;
8297 break;
8298
8299 default:
8300 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008301 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008302 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008303 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008304 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008305 (Py_ssize_t)(fmt - 1 -
8306 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 goto onError;
8308 }
8309 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008310 if (*pbuf == '-' || *pbuf == '+') {
8311 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 len--;
8313 }
8314 else if (flags & F_SIGN)
8315 sign = '+';
8316 else if (flags & F_BLANK)
8317 sign = ' ';
8318 else
8319 sign = 0;
8320 }
8321 if (width < len)
8322 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008323 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 reslen -= rescnt;
8325 rescnt = width + fmtcnt + 100;
8326 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008327 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008328 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008329 PyErr_NoMemory();
8330 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008331 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008332 if (_PyUnicode_Resize(&result, reslen) < 0) {
8333 Py_XDECREF(temp);
8334 goto onError;
8335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 res = PyUnicode_AS_UNICODE(result)
8337 + reslen - rescnt;
8338 }
8339 if (sign) {
8340 if (fill != ' ')
8341 *res++ = sign;
8342 rescnt--;
8343 if (width > len)
8344 width--;
8345 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008346 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8347 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008348 assert(pbuf[1] == c);
8349 if (fill != ' ') {
8350 *res++ = *pbuf++;
8351 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008352 }
Tim Petersfff53252001-04-12 18:38:48 +00008353 rescnt -= 2;
8354 width -= 2;
8355 if (width < 0)
8356 width = 0;
8357 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359 if (width > len && !(flags & F_LJUST)) {
8360 do {
8361 --rescnt;
8362 *res++ = fill;
8363 } while (--width > len);
8364 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008365 if (fill == ' ') {
8366 if (sign)
8367 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008368 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008369 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008370 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008371 *res++ = *pbuf++;
8372 *res++ = *pbuf++;
8373 }
8374 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008375 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376 res += len;
8377 rescnt -= len;
8378 while (--width >= len) {
8379 --rescnt;
8380 *res++ = ' ';
8381 }
8382 if (dict && (argidx < arglen) && c != '%') {
8383 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008384 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008385 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 goto onError;
8387 }
8388 Py_XDECREF(temp);
8389 } /* '%' */
8390 } /* until end */
8391 if (argidx < arglen && !dict) {
8392 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008393 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 goto onError;
8395 }
8396
Thomas Woutersa96affe2006-03-12 00:29:36 +00008397 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8398 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 if (args_owned) {
8400 Py_DECREF(args);
8401 }
8402 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 return (PyObject *)result;
8404
8405 onError:
8406 Py_XDECREF(result);
8407 Py_DECREF(uformat);
8408 if (args_owned) {
8409 Py_DECREF(args);
8410 }
8411 return NULL;
8412}
8413
/* Buffer protocol slots for unicode objects.  The entries are, in order,
   the read-buffer, write-buffer, segment-count and character-buffer
   handlers (as indicated by the cast applied to each one). */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8420
Jeremy Hylton938ace62002-07-17 16:30:39 +00008421static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008422unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8423
Tim Peters6d6c1a32001-08-02 04:15:00 +00008424static PyObject *
8425unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8426{
8427 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008428 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008429 char *encoding = NULL;
8430 char *errors = NULL;
8431
Guido van Rossume023fe02001-08-30 03:12:59 +00008432 if (type != &PyUnicode_Type)
8433 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008434 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8435 kwlist, &x, &encoding, &errors))
8436 return NULL;
8437 if (x == NULL)
8438 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008439 if (encoding == NULL && errors == NULL)
8440 return PyObject_Unicode(x);
8441 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008442 return PyUnicode_FromEncodedObject(x, encoding, errors);
8443}
8444
Guido van Rossume023fe02001-08-30 03:12:59 +00008445static PyObject *
8446unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8447{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008448 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008449 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008450
8451 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8452 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8453 if (tmp == NULL)
8454 return NULL;
8455 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008456 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008457 if (pnew == NULL) {
8458 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008459 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008460 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008461 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8462 if (pnew->str == NULL) {
8463 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008464 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008465 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008466 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008467 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008468 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8469 pnew->length = n;
8470 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008471 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008472 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008473}
8474
/* Docstring of the unicode type; installed in the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008481
/* Forward declaration so that the tp_iter slot of PyUnicode_Type can
   refer to the iterator factory. */
static PyObject *unicode_iter(PyObject *seq);
8483
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484PyTypeObject PyUnicode_Type = {
8485 PyObject_HEAD_INIT(&PyType_Type)
8486 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008487 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488 sizeof(PyUnicodeObject), /* tp_size */
8489 0, /* tp_itemsize */
8490 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008491 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008493 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008495 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008496 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008497 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008499 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 (hashfunc) unicode_hash, /* tp_hash*/
8501 0, /* tp_call*/
8502 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008503 PyObject_GenericGetAttr, /* tp_getattro */
8504 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008506 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8507 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008508 unicode_doc, /* tp_doc */
8509 0, /* tp_traverse */
8510 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008511 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008512 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008513 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008514 0, /* tp_iternext */
8515 unicode_methods, /* tp_methods */
8516 0, /* tp_members */
8517 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008518 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008519 0, /* tp_dict */
8520 0, /* tp_descr_get */
8521 0, /* tp_descr_set */
8522 0, /* tp_dictoffset */
8523 0, /* tp_init */
8524 0, /* tp_alloc */
8525 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008526 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527};
8528
/* Initialize the Unicode implementation */
8530
Thomas Wouters78890102000-07-22 19:25:51 +00008531void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008533 int i;
8534
Thomas Wouters477c8d52006-05-27 19:21:47 +00008535 /* XXX - move this array to unicodectype.c ? */
8536 Py_UNICODE linebreak[] = {
8537 0x000A, /* LINE FEED */
8538 0x000D, /* CARRIAGE RETURN */
8539 0x001C, /* FILE SEPARATOR */
8540 0x001D, /* GROUP SEPARATOR */
8541 0x001E, /* RECORD SEPARATOR */
8542 0x0085, /* NEXT LINE */
8543 0x2028, /* LINE SEPARATOR */
8544 0x2029, /* PARAGRAPH SEPARATOR */
8545 };
8546
Fred Drakee4315f52000-05-09 19:53:39 +00008547 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008548 unicode_freelist = NULL;
8549 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008551 if (!unicode_empty)
8552 return;
8553
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008554 for (i = 0; i < 256; i++)
8555 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008556 if (PyType_Ready(&PyUnicode_Type) < 0)
8557 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008558
8559 /* initialize the linebreak bloom filter */
8560 bloom_linebreak = make_bloom_mask(
8561 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8562 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008563
8564 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565}
8566
/* Finalize the Unicode implementation */
8568
8569void
Thomas Wouters78890102000-07-22 19:25:51 +00008570_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008572 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008573 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008575 Py_XDECREF(unicode_empty);
8576 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008577
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008578 for (i = 0; i < 256; i++) {
8579 if (unicode_latin1[i]) {
8580 Py_DECREF(unicode_latin1[i]);
8581 unicode_latin1[i] = NULL;
8582 }
8583 }
8584
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008585 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 PyUnicodeObject *v = u;
8587 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008588 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008589 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008590 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008591 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008593 unicode_freelist = NULL;
8594 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008596
Walter Dörwald16807132007-05-25 13:52:07 +00008597void
8598PyUnicode_InternInPlace(PyObject **p)
8599{
8600 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8601 PyObject *t;
8602 if (s == NULL || !PyUnicode_Check(s))
8603 Py_FatalError(
8604 "PyUnicode_InternInPlace: unicode strings only please!");
8605 /* If it's a subclass, we don't really know what putting
8606 it in the interned dict might do. */
8607 if (!PyUnicode_CheckExact(s))
8608 return;
8609 if (PyUnicode_CHECK_INTERNED(s))
8610 return;
8611 if (interned == NULL) {
8612 interned = PyDict_New();
8613 if (interned == NULL) {
8614 PyErr_Clear(); /* Don't leave an exception */
8615 return;
8616 }
8617 }
8618 t = PyDict_GetItem(interned, (PyObject *)s);
8619 if (t) {
8620 Py_INCREF(t);
8621 Py_DECREF(*p);
8622 *p = t;
8623 return;
8624 }
8625
8626 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8627 PyErr_Clear();
8628 return;
8629 }
8630 /* The two references in interned are not counted by refcnt.
8631 The deallocator will take care of this */
8632 s->ob_refcnt -= 2;
8633 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8634}
8635
8636void
8637PyUnicode_InternImmortal(PyObject **p)
8638{
8639 PyUnicode_InternInPlace(p);
8640 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8641 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8642 Py_INCREF(*p);
8643 }
8644}
8645
8646PyObject *
8647PyUnicode_InternFromString(const char *cp)
8648{
8649 PyObject *s = PyUnicode_FromString(cp);
8650 if (s == NULL)
8651 return NULL;
8652 PyUnicode_InternInPlace(&s);
8653 return s;
8654}
8655
8656void _Py_ReleaseInternedUnicodeStrings(void)
8657{
8658 PyObject *keys;
8659 PyUnicodeObject *s;
8660 Py_ssize_t i, n;
8661 Py_ssize_t immortal_size = 0, mortal_size = 0;
8662
8663 if (interned == NULL || !PyDict_Check(interned))
8664 return;
8665 keys = PyDict_Keys(interned);
8666 if (keys == NULL || !PyList_Check(keys)) {
8667 PyErr_Clear();
8668 return;
8669 }
8670
8671 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8672 detector, interned unicode strings are not forcibly deallocated;
8673 rather, we give them their stolen references back, and then clear
8674 and DECREF the interned dict. */
8675
8676 n = PyList_GET_SIZE(keys);
8677 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8678 n);
8679 for (i = 0; i < n; i++) {
8680 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8681 switch (s->state) {
8682 case SSTATE_NOT_INTERNED:
8683 /* XXX Shouldn't happen */
8684 break;
8685 case SSTATE_INTERNED_IMMORTAL:
8686 s->ob_refcnt += 1;
8687 immortal_size += s->length;
8688 break;
8689 case SSTATE_INTERNED_MORTAL:
8690 s->ob_refcnt += 2;
8691 mortal_size += s->length;
8692 break;
8693 default:
8694 Py_FatalError("Inconsistent interned string state.");
8695 }
8696 s->state = SSTATE_NOT_INTERNED;
8697 }
8698 fprintf(stderr, "total size of all interned strings: "
8699 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8700 "mortal/immortal\n", mortal_size, immortal_size);
8701 Py_DECREF(keys);
8702 PyDict_Clear(interned);
8703 Py_DECREF(interned);
8704 interned = NULL;
8705}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008706
8707
/********************* Unicode Iterator **************************/
8709
8710typedef struct {
8711 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008712 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008713 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8714} unicodeiterobject;
8715
8716static void
8717unicodeiter_dealloc(unicodeiterobject *it)
8718{
8719 _PyObject_GC_UNTRACK(it);
8720 Py_XDECREF(it->it_seq);
8721 PyObject_GC_Del(it);
8722}
8723
8724static int
8725unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8726{
8727 Py_VISIT(it->it_seq);
8728 return 0;
8729}
8730
8731static PyObject *
8732unicodeiter_next(unicodeiterobject *it)
8733{
8734 PyUnicodeObject *seq;
8735 PyObject *item;
8736
8737 assert(it != NULL);
8738 seq = it->it_seq;
8739 if (seq == NULL)
8740 return NULL;
8741 assert(PyUnicode_Check(seq));
8742
8743 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008744 item = PyUnicode_FromUnicode(
8745 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008746 if (item != NULL)
8747 ++it->it_index;
8748 return item;
8749 }
8750
8751 Py_DECREF(seq);
8752 it->it_seq = NULL;
8753 return NULL;
8754}
8755
8756static PyObject *
8757unicodeiter_len(unicodeiterobject *it)
8758{
8759 Py_ssize_t len = 0;
8760 if (it->it_seq)
8761 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8762 return PyInt_FromSsize_t(len);
8763}
8764
8765PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8766
8767static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008768 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8769 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008770 {NULL, NULL} /* sentinel */
8771};
8772
8773PyTypeObject PyUnicodeIter_Type = {
8774 PyObject_HEAD_INIT(&PyType_Type)
8775 0, /* ob_size */
8776 "unicodeiterator", /* tp_name */
8777 sizeof(unicodeiterobject), /* tp_basicsize */
8778 0, /* tp_itemsize */
8779 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008780 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008781 0, /* tp_print */
8782 0, /* tp_getattr */
8783 0, /* tp_setattr */
8784 0, /* tp_compare */
8785 0, /* tp_repr */
8786 0, /* tp_as_number */
8787 0, /* tp_as_sequence */
8788 0, /* tp_as_mapping */
8789 0, /* tp_hash */
8790 0, /* tp_call */
8791 0, /* tp_str */
8792 PyObject_GenericGetAttr, /* tp_getattro */
8793 0, /* tp_setattro */
8794 0, /* tp_as_buffer */
8795 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8796 0, /* tp_doc */
8797 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8798 0, /* tp_clear */
8799 0, /* tp_richcompare */
8800 0, /* tp_weaklistoffset */
8801 PyObject_SelfIter, /* tp_iter */
8802 (iternextfunc)unicodeiter_next, /* tp_iternext */
8803 unicodeiter_methods, /* tp_methods */
8804 0,
8805};
8806
8807static PyObject *
8808unicode_iter(PyObject *seq)
8809{
8810 unicodeiterobject *it;
8811
8812 if (!PyUnicode_Check(seq)) {
8813 PyErr_BadInternalCall();
8814 return NULL;
8815 }
8816 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8817 if (it == NULL)
8818 return NULL;
8819 it->it_index = 0;
8820 Py_INCREF(seq);
8821 it->it_seq = (PyUnicodeObject *)seq;
8822 _PyObject_GC_TRACK(it);
8823 return (PyObject *)it;
8824}
8825
#ifdef __cplusplus
}   /* NOTE(review): presumably closes an extern "C" block opened earlier in this file -- confirm */
#endif
8829
8830
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/