blob: 2728f1f64937ef642fba59b2bc700e031ec5a3b8 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest 5
   bits of each Unicode character as the bit index. */
138
139/* the linebreak mask is set up by Unicode_Init below */
140
/* Integer type holding a bloom bitmask (one bit per value of ch & 0x1F). */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is unsigned long so that selecting bit 31 never
   shifts into the sign bit of an int, and `mask` is parenthesized so that
   compound expressions can be passed safely. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap bloom pre-check followed by the exact line break test.
   Note: `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Non-zero if `chr` is in `set`: bloom pre-check first, exact scan second.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators.  Note: `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
229/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000230 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
308 unicode->ob_refcnt = 3;
309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000341 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000355 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
380/* Internal API for use in unicodeobject.c only ! */
381#define _PyUnicode_Resize(unicodevar, length) \
382 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
430 some optimizations which share commonly used objects. */
431 if (u != NULL) {
432
433 /* Optimization for empty strings */
434 if (size == 0 && unicode_empty != NULL) {
435 Py_INCREF(unicode_empty);
436 return (PyObject *)unicode_empty;
437 }
438
Walter Dörwald071b9da2007-05-05 14:21:20 +0000439 /* Single characters are shared when using this constructor */
440 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000441 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
444 if (!unicode)
445 return NULL;
446 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000447 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
453
Walter Dörwald55507312007-05-18 13:12:10 +0000454 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000460 Py_UNICODE *p = unicode->str;
Martin v. Löwis5b222132007-06-10 09:51:05 +0000461 while (size--)
462 *p++ = *u++;
463 /* Don't need to write trailing 0 because
464 that's already done by _PyUnicode_New */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
467 return (PyObject *)unicode;
468}
469
Walter Dörwaldd2034312007-05-18 16:29:38 +0000470PyObject *PyUnicode_FromString(const char *u)
471{
472 size_t size = strlen(u);
473 if (size > PY_SSIZE_T_MAX) {
474 PyErr_SetString(PyExc_OverflowError, "input too long");
475 return NULL;
476 }
477
478 return PyUnicode_FromStringAndSize(u, size);
479}
480
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481#ifdef HAVE_WCHAR_H
482
483PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000484 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485{
486 PyUnicodeObject *unicode;
487
488 if (w == NULL) {
489 PyErr_BadInternalCall();
490 return NULL;
491 }
492
493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the wchar_t data into the new object */
498#ifdef HAVE_USABLE_WCHAR_T
499 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000500#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 {
502 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000505 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 *u++ = *w++;
507 }
508#endif
509
510 return (PyObject *)unicode;
511}
512
Walter Dörwald346737f2007-05-31 10:44:43 +0000513static void
514makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
515{
516 *fmt++ = '%';
517 if (width) {
518 if (zeropad)
519 *fmt++ = '0';
520 fmt += sprintf(fmt, "%d", width);
521 }
522 if (precision)
523 fmt += sprintf(fmt, ".%d", precision);
524 if (longflag)
525 *fmt++ = 'l';
526 else if (size_tflag) {
527 char *f = PY_FORMAT_SIZE_T;
528 while (*f)
529 *fmt++ = *f++;
530 }
531 *fmt++ = c;
532 *fmt = '\0';
533}
534
Walter Dörwaldd2034312007-05-18 16:29:38 +0000535#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
536
537PyObject *
538PyUnicode_FromFormatV(const char *format, va_list vargs)
539{
540 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000541 Py_ssize_t callcount = 0;
542 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000543 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000545 int width = 0;
546 int precision = 0;
547 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 const char* f;
549 Py_UNICODE *s;
550 PyObject *string;
551 /* used by sprintf */
552 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000553 /* use abuffer instead of buffer, if we need more space
554 * (which can happen if there's a format specifier with width). */
555 char *abuffer = NULL;
556 char *realbuffer;
557 Py_ssize_t abuffersize = 0;
558 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000559 const char *copy;
560
561#ifdef VA_LIST_IS_ARRAY
562 Py_MEMCPY(count, vargs, sizeof(va_list));
563#else
564#ifdef __va_copy
565 __va_copy(count, vargs);
566#else
567 count = vargs;
568#endif
569#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 /* step 1: count the number of %S/%R format specifications
571 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
572 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000573 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 ++callcount;
576 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000577 /* step 2: allocate memory for the results of
578 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 if (callcount) {
580 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
581 if (!callresults) {
582 PyErr_NoMemory();
583 return NULL;
584 }
585 callresult = callresults;
586 }
587 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588 for (f = format; *f; f++) {
589 if (*f == '%') {
590 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000591 width = 0;
592 while (isdigit(Py_CHARMASK(*f)))
593 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000594 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
595 ;
596
597 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
598 * they don't affect the amount of space we reserve.
599 */
600 if ((*f == 'l' || *f == 'z') &&
601 (f[1] == 'd' || f[1] == 'u'))
602 ++f;
603
604 switch (*f) {
605 case 'c':
606 (void)va_arg(count, int);
607 /* fall through... */
608 case '%':
609 n++;
610 break;
611 case 'd': case 'u': case 'i': case 'x':
612 (void) va_arg(count, int);
613 /* 20 bytes is enough to hold a 64-bit
614 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000615 This isn't enough for octal.
616 If a width is specified we need more
617 (which we allocate later). */
618 if (width < 20)
619 width = 20;
620 n += width;
621 if (abuffersize < width)
622 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000623 break;
624 case 's':
625 n += strlen(va_arg(count, char*));
626 break;
627 case 'U':
628 {
629 PyObject *obj = va_arg(count, PyObject *);
630 assert(obj && PyUnicode_Check(obj));
631 n += PyUnicode_GET_SIZE(obj);
632 break;
633 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000634 case 'V':
635 {
636 PyObject *obj = va_arg(count, PyObject *);
637 const char *str = va_arg(count, const char *);
638 assert(obj || str);
639 assert(!obj || PyUnicode_Check(obj));
640 if (obj)
641 n += PyUnicode_GET_SIZE(obj);
642 else
643 n += strlen(str);
644 break;
645 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000646 case 'S':
647 {
648 PyObject *obj = va_arg(count, PyObject *);
649 PyObject *str;
650 assert(obj);
651 str = PyObject_Unicode(obj);
652 if (!str)
653 goto fail;
654 n += PyUnicode_GET_SIZE(str);
655 /* Remember the str and switch to the next slot */
656 *callresult++ = str;
657 break;
658 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 case 'R':
660 {
661 PyObject *obj = va_arg(count, PyObject *);
662 PyObject *repr;
663 assert(obj);
664 repr = PyObject_Repr(obj);
665 if (!repr)
666 goto fail;
667 n += PyUnicode_GET_SIZE(repr);
668 /* Remember the repr and switch to the next slot */
669 *callresult++ = repr;
670 break;
671 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000672 case 'p':
673 (void) va_arg(count, int);
674 /* maximum 64-bit pointer representation:
675 * 0xffffffffffffffff
676 * so 19 characters is enough.
677 * XXX I count 18 -- what's the extra for?
678 */
679 n += 19;
680 break;
681 default:
682 /* if we stumble upon an unknown
683 formatting code, copy the rest of
684 the format string to the output
685 string. (we cannot just skip the
686 code, since there's no way to know
687 what's in the argument list) */
688 n += strlen(p);
689 goto expand;
690 }
691 } else
692 n++;
693 }
694 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000695 if (abuffersize > 20) {
696 abuffer = PyMem_Malloc(abuffersize);
697 if (!abuffer) {
698 PyErr_NoMemory();
699 goto fail;
700 }
701 realbuffer = abuffer;
702 }
703 else
704 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000705 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000707 we don't have to resize the string.
708 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 string = PyUnicode_FromUnicode(NULL, n);
710 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000711 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000714 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000715
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f++;
719 int longflag = 0;
720 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 zeropad = (*f == '0');
722 /* parse the width.precision part */
723 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000725 width = (width*10) + *f++ - '0';
726 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 if (*f == '.') {
728 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000732 /* handle the long flag, but only for %ld and %lu.
733 others can be added when necessary. */
734 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
735 longflag = 1;
736 ++f;
737 }
738 /* handle the size_t flag. */
739 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
740 size_tflag = 1;
741 ++f;
742 }
743
744 switch (*f) {
745 case 'c':
746 *s++ = va_arg(vargs, int);
747 break;
748 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000753 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 sprintf(realbuffer, fmt, va_arg(vargs, int));
756 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 break;
758 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000763 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
766 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 break;
768 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000769 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
770 sprintf(realbuffer, fmt, va_arg(vargs, int));
771 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000772 break;
773 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000774 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
775 sprintf(realbuffer, fmt, va_arg(vargs, int));
776 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000777 break;
778 case 's':
779 p = va_arg(vargs, char*);
780 appendstring(p);
781 break;
782 case 'U':
783 {
784 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000785 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
786 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
787 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000788 break;
789 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000790 case 'V':
791 {
792 PyObject *obj = va_arg(vargs, PyObject *);
793 const char *str = va_arg(vargs, const char *);
794 if (obj) {
795 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
796 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
797 s += size;
798 } else {
799 appendstring(str);
800 }
801 break;
802 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000803 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000804 case 'R':
805 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000806 Py_UNICODE *ucopy;
807 Py_ssize_t usize;
808 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000809 /* unused, since we already have the result */
810 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000811 ucopy = PyUnicode_AS_UNICODE(*callresult);
812 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000813 for (upos = 0; upos<usize;)
814 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000815 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000817 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000818 ++callresult;
819 break;
820 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000821 case 'p':
822 sprintf(buffer, "%p", va_arg(vargs, void*));
823 /* %p is ill-defined: ensure leading 0x. */
824 if (buffer[1] == 'X')
825 buffer[1] = 'x';
826 else if (buffer[1] != 'x') {
827 memmove(buffer+2, buffer, strlen(buffer)+1);
828 buffer[0] = '0';
829 buffer[1] = 'x';
830 }
831 appendstring(buffer);
832 break;
833 case '%':
834 *s++ = '%';
835 break;
836 default:
837 appendstring(p);
838 goto end;
839 }
840 } else
841 *s++ = *f;
842 }
843
844 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000845 if (callresults)
846 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000847 if (abuffer)
848 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000849 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
850 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 fail:
852 if (callresults) {
853 PyObject **callresult2 = callresults;
854 while (callresult2 <= callresult) {
855 Py_DECREF(*callresult2);
856 ++callresult2;
857 }
858 PyMem_Free(callresults);
859 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 if (abuffer)
861 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000862 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863}
864
865#undef appendstring
866
867PyObject *
868PyUnicode_FromFormat(const char *format, ...)
869{
870 PyObject* ret;
871 va_list vargs;
872
873#ifdef HAVE_STDARG_PROTOTYPES
874 va_start(vargs, format);
875#else
876 va_start(vargs);
877#endif
878 ret = PyUnicode_FromFormatV(format, vargs);
879 va_end(vargs);
880 return ret;
881}
882
Martin v. Löwis18e16552006-02-15 17:27:45 +0000883Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
884 wchar_t *w,
885 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000886{
887 if (unicode == NULL) {
888 PyErr_BadInternalCall();
889 return -1;
890 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000891
892 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000894 size = PyUnicode_GET_SIZE(unicode) + 1;
895
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896#ifdef HAVE_USABLE_WCHAR_T
897 memcpy(w, unicode->str, size * sizeof(wchar_t));
898#else
899 {
900 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000901 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000903 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904 *w++ = *u++;
905 }
906#endif
907
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000908 if (size > PyUnicode_GET_SIZE(unicode))
909 return PyUnicode_GET_SIZE(unicode);
910 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 return size;
912}
913
914#endif
915
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916PyObject *PyUnicode_FromOrdinal(int ordinal)
917{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000918 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919
920#ifdef Py_UNICODE_WIDE
921 if (ordinal < 0 || ordinal > 0x10ffff) {
922 PyErr_SetString(PyExc_ValueError,
Walter Dörwaldb41bb792007-06-05 20:02:26 +0000923 "chr() arg not in range(0x110000) "
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000924 "(wide Python build)");
925 return NULL;
926 }
927#else
928 if (ordinal < 0 || ordinal > 0xffff) {
929 PyErr_SetString(PyExc_ValueError,
Walter Dörwaldb41bb792007-06-05 20:02:26 +0000930 "chr() arg not in range(0x10000) "
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000931 "(narrow Python build)");
932 return NULL;
933 }
934#endif
935
Hye-Shik Chang40574832004-04-06 07:24:51 +0000936 s[0] = (Py_UNICODE)ordinal;
937 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000938}
939
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940PyObject *PyUnicode_FromObject(register PyObject *obj)
941{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000942 /* XXX Perhaps we should make this API an alias of
943 PyObject_Unicode() instead ?! */
944 if (PyUnicode_CheckExact(obj)) {
945 Py_INCREF(obj);
946 return obj;
947 }
948 if (PyUnicode_Check(obj)) {
949 /* For a Unicode subtype that's not a Unicode object,
950 return a true Unicode object with the same data. */
951 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
952 PyUnicode_GET_SIZE(obj));
953 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000954 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
955}
956
957PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
958 const char *encoding,
959 const char *errors)
960{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000961 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000962 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000963 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000964
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965 if (obj == NULL) {
966 PyErr_BadInternalCall();
967 return NULL;
968 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000969
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000970#if 0
971 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000972 that no encodings is given and then redirect to
973 PyObject_Unicode() which then applies the additional logic for
974 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000975
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000976 NOTE: This API should really only be used for object which
977 represent *encoded* Unicode !
978
979 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000980 if (PyUnicode_Check(obj)) {
981 if (encoding) {
982 PyErr_SetString(PyExc_TypeError,
983 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000984 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000985 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000986 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000987 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000988#else
989 if (PyUnicode_Check(obj)) {
990 PyErr_SetString(PyExc_TypeError,
991 "decoding Unicode is not supported");
992 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000993 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000994#endif
995
996 /* Coerce object */
997 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000998 s = PyString_AS_STRING(obj);
999 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001000 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001001 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1002 /* Overwrite the error message with something more useful in
1003 case of a TypeError. */
1004 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001005 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001006 "coercing to Unicode: need string or buffer, "
1007 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001008 obj->ob_type->tp_name);
1009 goto onError;
1010 }
Tim Petersced69f82003-09-16 20:30:58 +00001011
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001012 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001013 if (len == 0) {
1014 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001015 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001016 }
Tim Petersced69f82003-09-16 20:30:58 +00001017 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001018 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001019
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001020 return v;
1021
1022 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024}
1025
1026PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001027 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 const char *encoding,
1029 const char *errors)
1030{
1031 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001032
1033 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001034 encoding = PyUnicode_GetDefaultEncoding();
1035
1036 /* Shortcuts for common default encodings */
1037 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001039 else if (strcmp(encoding, "latin-1") == 0)
1040 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001041#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1042 else if (strcmp(encoding, "mbcs") == 0)
1043 return PyUnicode_DecodeMBCS(s, size, errors);
1044#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001045 else if (strcmp(encoding, "ascii") == 0)
1046 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047
1048 /* Decode via the codec registry */
1049 buffer = PyBuffer_FromMemory((void *)s, size);
1050 if (buffer == NULL)
1051 goto onError;
1052 unicode = PyCodec_Decode(buffer, encoding, errors);
1053 if (unicode == NULL)
1054 goto onError;
1055 if (!PyUnicode_Check(unicode)) {
1056 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001057 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 unicode->ob_type->tp_name);
1059 Py_DECREF(unicode);
1060 goto onError;
1061 }
1062 Py_DECREF(buffer);
1063 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001064
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065 onError:
1066 Py_XDECREF(buffer);
1067 return NULL;
1068}
1069
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001070PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1071 const char *encoding,
1072 const char *errors)
1073{
1074 PyObject *v;
1075
1076 if (!PyUnicode_Check(unicode)) {
1077 PyErr_BadArgument();
1078 goto onError;
1079 }
1080
1081 if (encoding == NULL)
1082 encoding = PyUnicode_GetDefaultEncoding();
1083
1084 /* Decode via the codec registry */
1085 v = PyCodec_Decode(unicode, encoding, errors);
1086 if (v == NULL)
1087 goto onError;
1088 return v;
1089
1090 onError:
1091 return NULL;
1092}
1093
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001095 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 const char *encoding,
1097 const char *errors)
1098{
1099 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001100
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 unicode = PyUnicode_FromUnicode(s, size);
1102 if (unicode == NULL)
1103 return NULL;
1104 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1105 Py_DECREF(unicode);
1106 return v;
1107}
1108
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001109PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1110 const char *encoding,
1111 const char *errors)
1112{
1113 PyObject *v;
1114
1115 if (!PyUnicode_Check(unicode)) {
1116 PyErr_BadArgument();
1117 goto onError;
1118 }
1119
1120 if (encoding == NULL)
1121 encoding = PyUnicode_GetDefaultEncoding();
1122
1123 /* Encode via the codec registry */
1124 v = PyCodec_Encode(unicode, encoding, errors);
1125 if (v == NULL)
1126 goto onError;
1127 return v;
1128
1129 onError:
1130 return NULL;
1131}
1132
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1134 const char *encoding,
1135 const char *errors)
1136{
1137 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001138
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139 if (!PyUnicode_Check(unicode)) {
1140 PyErr_BadArgument();
1141 goto onError;
1142 }
Fred Drakee4315f52000-05-09 19:53:39 +00001143
Tim Petersced69f82003-09-16 20:30:58 +00001144 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001145 encoding = PyUnicode_GetDefaultEncoding();
1146
1147 /* Shortcuts for common default encodings */
1148 if (errors == NULL) {
1149 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001150 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001151 else if (strcmp(encoding, "latin-1") == 0)
1152 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001153#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1154 else if (strcmp(encoding, "mbcs") == 0)
1155 return PyUnicode_AsMBCSString(unicode);
1156#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001157 else if (strcmp(encoding, "ascii") == 0)
1158 return PyUnicode_AsASCIIString(unicode);
1159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160
1161 /* Encode via the codec registry */
1162 v = PyCodec_Encode(unicode, encoding, errors);
1163 if (v == NULL)
1164 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001165 if (!PyBytes_Check(v)) {
1166 if (PyString_Check(v)) {
1167 /* Old codec, turn it into bytes */
1168 PyObject *b = PyBytes_FromObject(v);
1169 Py_DECREF(v);
1170 return b;
1171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001173 "encoder did not return a bytes object "
1174 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1175 v->ob_type->tp_name,
1176 encoding ? encoding : "NULL",
1177 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 Py_DECREF(v);
1179 goto onError;
1180 }
1181 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 onError:
1184 return NULL;
1185}
1186
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001187PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1188 const char *errors)
1189{
1190 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001191 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001192 if (v)
1193 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001194 if (errors != NULL)
1195 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1196 if (errors == NULL) {
1197 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1198 PyUnicode_GET_SIZE(unicode),
1199 NULL);
1200 }
1201 else {
1202 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1203 }
1204 if (!b)
1205 return NULL;
1206 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1207 PyBytes_Size(b));
1208 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001209 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001210 return v;
1211}
1212
Martin v. Löwis5b222132007-06-10 09:51:05 +00001213char*
1214PyUnicode_AsString(PyObject *unicode)
1215{
1216 assert(PyUnicode_Check(unicode));
1217 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1218 if (!unicode)
1219 return NULL;
1220 return PyString_AsString(unicode);
1221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1224{
1225 if (!PyUnicode_Check(unicode)) {
1226 PyErr_BadArgument();
1227 goto onError;
1228 }
1229 return PyUnicode_AS_UNICODE(unicode);
1230
1231 onError:
1232 return NULL;
1233}
1234
Martin v. Löwis18e16552006-02-15 17:27:45 +00001235Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236{
1237 if (!PyUnicode_Check(unicode)) {
1238 PyErr_BadArgument();
1239 goto onError;
1240 }
1241 return PyUnicode_GET_SIZE(unicode);
1242
1243 onError:
1244 return -1;
1245}
1246
/* Return the name of the default encoding, i.e. the current value of the
   file-level unicode_default_encoding variable.  The pointer is returned
   as-is; nothing is copied. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}
1251
1252int PyUnicode_SetDefaultEncoding(const char *encoding)
1253{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001254 if (strcmp(encoding, unicode_default_encoding) != 0) {
1255 PyErr_Format(PyExc_ValueError,
1256 "Can only set default encoding to %s",
1257 unicode_default_encoding);
1258 return -1;
1259 }
Fred Drakee4315f52000-05-09 19:53:39 +00001260 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001261}
1262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001263/* error handling callback helper:
1264 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001265 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001266 and adjust various state variables.
1267 return 0 on success, -1 on error
1268*/
1269
1270static
1271int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1272 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001273 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1274 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001275{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001276 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001277
1278 PyObject *restuple = NULL;
1279 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001280 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1281 Py_ssize_t requiredsize;
1282 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001283 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001284 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001285 int res = -1;
1286
1287 if (*errorHandler == NULL) {
1288 *errorHandler = PyCodec_LookupError(errors);
1289 if (*errorHandler == NULL)
1290 goto onError;
1291 }
1292
1293 if (*exceptionObject == NULL) {
1294 *exceptionObject = PyUnicodeDecodeError_Create(
1295 encoding, input, insize, *startinpos, *endinpos, reason);
1296 if (*exceptionObject == NULL)
1297 goto onError;
1298 }
1299 else {
1300 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1301 goto onError;
1302 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1303 goto onError;
1304 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1305 goto onError;
1306 }
1307
1308 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1309 if (restuple == NULL)
1310 goto onError;
1311 if (!PyTuple_Check(restuple)) {
1312 PyErr_Format(PyExc_TypeError, &argparse[4]);
1313 goto onError;
1314 }
1315 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1316 goto onError;
1317 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001318 newpos = insize+newpos;
1319 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001320 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001321 goto onError;
1322 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001323
1324 /* need more space? (at least enough for what we
1325 have+the replacement+the rest of the string (starting
1326 at the new input position), so we won't have to check space
1327 when there are no errors in the rest of the string) */
1328 repptr = PyUnicode_AS_UNICODE(repunicode);
1329 repsize = PyUnicode_GET_SIZE(repunicode);
1330 requiredsize = *outpos + repsize + insize-newpos;
1331 if (requiredsize > outsize) {
1332 if (requiredsize<2*outsize)
1333 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001334 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001335 goto onError;
1336 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1337 }
1338 *endinpos = newpos;
1339 *inptr = input + newpos;
1340 Py_UNICODE_COPY(*outptr, repptr, repsize);
1341 *outptr += repsize;
1342 *outpos += repsize;
1343 /* we made it! */
1344 res = 0;
1345
1346 onError:
1347 Py_XDECREF(restuple);
1348 return res;
1349}
1350
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001351/* --- UTF-7 Codec -------------------------------------------------------- */
1352
1353/* see RFC2152 for details */
1354
/* Classification table for the 128 ASCII code points, indexed by
   character value; consulted through the SPECIAL() macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,  /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x10 - 0x1f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,  /* 0x20 - 0x2f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,  /* 0x30 - 0x3f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x40 - 0x4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,  /* 0x50 - 0x5f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x60 - 0x6f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,  /* 0x70 - 0x7f */

};
1373
/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be written
   directly in UTF-7.  That is the case when c is outside 1..127, when its
   utf7_special[] class is 1, when its class is 2 (whitespace) and
   encodeWS is true, or when its class is 3 (RFC2152 Set O) and encodeO is
   true.  c is evaluated several times, so pass a side-effect-free
   expression.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))
1383
/* B64(n): the base64 alphabet character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true iff c is one of the 64 base64 alphabet characters.
   Explicit range tests are used instead of isalnum(): c may be a
   Py_UNICODE value that does not fit in an unsigned char (for which
   isalnum() is undefined), and isalnum() is locale dependent.  Like UB64()
   below this relies on ASCII character codes.  c is evaluated several
   times, so pass a side-effect-free expression. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
/* UB64(c): the six-bit value of base64 character c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001391
/* ENCODE(out, ch, bits): while at least six bits are pending in `ch`,
   write the base64 character for the topmost pending six bits through
   `out` (which is advanced) and reduce `bits` by six.  `out` and `bits`
   are modified in place, so they must be lvalues of the caller; `ch`
   itself is left untouched.  Expands to a bare while statement. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }
1397
/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in `ch`, extract the topmost pending 16-bit unit and reduce
   `bits` by sixteen.  The unit is
     - dropped (and `surrogate` cleared) when `surrogate` is set,
     - reported as an error when it lies in 0xDC00..0xDFFF: `surrogate` is
       set, `errmsg` is assigned and control jumps to `utf7Error`,
     - written through `out` (which is advanced) otherwise.
   The expansion site must therefore provide a variable named `errmsg` and
   a label named `utf7Error`.  `out`, `bits` and `surrogate` are modified
   in place.  Expands to a bare while statement.

   NOTE(review): the range tested (0xDC00..0xDFFF) is the low-surrogate
   range, while the inline comment speaks of an error already raised for
   the high surrogate -- confirm which half of a pair is meant to trigger
   the error. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001416
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001417PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001418 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001419 const char *errors)
1420{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001421 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001422 Py_ssize_t startinpos;
1423 Py_ssize_t endinpos;
1424 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001425 const char *e;
1426 PyUnicodeObject *unicode;
1427 Py_UNICODE *p;
1428 const char *errmsg = "";
1429 int inShift = 0;
1430 unsigned int bitsleft = 0;
1431 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001432 int surrogate = 0;
1433 PyObject *errorHandler = NULL;
1434 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001435
1436 unicode = _PyUnicode_New(size);
1437 if (!unicode)
1438 return NULL;
1439 if (size == 0)
1440 return (PyObject *)unicode;
1441
1442 p = unicode->str;
1443 e = s + size;
1444
1445 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 Py_UNICODE ch;
1447 restart:
1448 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001449
1450 if (inShift) {
1451 if ((ch == '-') || !B64CHAR(ch)) {
1452 inShift = 0;
1453 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001454
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001455 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1456 if (bitsleft >= 6) {
1457 /* The shift sequence has a partial character in it. If
1458 bitsleft < 6 then we could just classify it as padding
1459 but that is not the case here */
1460
1461 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001462 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001463 }
1464 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001465 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001466 here so indicate the potential of a misencoded character. */
1467
1468 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1469 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1470 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001471 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001472 }
1473
1474 if (ch == '-') {
1475 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001476 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477 inShift = 1;
1478 }
1479 } else if (SPECIAL(ch,0,0)) {
1480 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001481 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001482 } else {
1483 *p++ = ch;
1484 }
1485 } else {
1486 charsleft = (charsleft << 6) | UB64(ch);
1487 bitsleft += 6;
1488 s++;
1489 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1490 }
1491 }
1492 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001493 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001494 s++;
1495 if (s < e && *s == '-') {
1496 s++;
1497 *p++ = '+';
1498 } else
1499 {
1500 inShift = 1;
1501 bitsleft = 0;
1502 }
1503 }
1504 else if (SPECIAL(ch,0,0)) {
1505 errmsg = "unexpected special character";
1506 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001507 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001508 }
1509 else {
1510 *p++ = ch;
1511 s++;
1512 }
1513 continue;
1514 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001515 outpos = p-PyUnicode_AS_UNICODE(unicode);
1516 endinpos = s-starts;
1517 if (unicode_decode_call_errorhandler(
1518 errors, &errorHandler,
1519 "utf7", errmsg,
1520 starts, size, &startinpos, &endinpos, &exc, &s,
1521 (PyObject **)&unicode, &outpos, &p))
1522 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001523 }
1524
1525 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001526 outpos = p-PyUnicode_AS_UNICODE(unicode);
1527 endinpos = size;
1528 if (unicode_decode_call_errorhandler(
1529 errors, &errorHandler,
1530 "utf7", "unterminated shift sequence",
1531 starts, size, &startinpos, &endinpos, &exc, &s,
1532 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001533 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001534 if (s < e)
1535 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001536 }
1537
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001538 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001539 goto onError;
1540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001541 Py_XDECREF(errorHandler);
1542 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001543 return (PyObject *)unicode;
1544
1545onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 Py_XDECREF(errorHandler);
1547 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001548 Py_DECREF(unicode);
1549 return NULL;
1550}
1551
1552
1553PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001554 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001555 int encodeSetO,
1556 int encodeWhiteSpace,
1557 const char *errors)
1558{
1559 PyObject *v;
1560 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001561 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001562 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001563 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001564 unsigned int bitsleft = 0;
1565 unsigned long charsleft = 0;
1566 char * out;
1567 char * start;
1568
1569 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001570 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001571
Walter Dörwald51ab4142007-05-05 14:43:36 +00001572 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001573 if (v == NULL)
1574 return NULL;
1575
Walter Dörwald51ab4142007-05-05 14:43:36 +00001576 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001577 for (;i < size; ++i) {
1578 Py_UNICODE ch = s[i];
1579
1580 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001581 if (ch == '+') {
1582 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001583 *out++ = '-';
1584 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1585 charsleft = ch;
1586 bitsleft = 16;
1587 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001588 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001589 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001590 } else {
1591 *out++ = (char) ch;
1592 }
1593 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001594 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1595 *out++ = B64(charsleft << (6-bitsleft));
1596 charsleft = 0;
1597 bitsleft = 0;
1598 /* Characters not in the BASE64 set implicitly unshift the sequence
1599 so no '-' is required, except if the character is itself a '-' */
1600 if (B64CHAR(ch) || ch == '-') {
1601 *out++ = '-';
1602 }
1603 inShift = 0;
1604 *out++ = (char) ch;
1605 } else {
1606 bitsleft += 16;
1607 charsleft = (charsleft << 16) | ch;
1608 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1609
1610 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001611 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001612 or '-' then the shift sequence will be terminated implicitly and we
1613 don't have to insert a '-'. */
1614
1615 if (bitsleft == 0) {
1616 if (i + 1 < size) {
1617 Py_UNICODE ch2 = s[i+1];
1618
1619 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001620
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001621 } else if (B64CHAR(ch2) || ch2 == '-') {
1622 *out++ = '-';
1623 inShift = 0;
1624 } else {
1625 inShift = 0;
1626 }
1627
1628 }
1629 else {
1630 *out++ = '-';
1631 inShift = 0;
1632 }
1633 }
Tim Petersced69f82003-09-16 20:30:58 +00001634 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001636 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001637 if (bitsleft) {
1638 *out++= B64(charsleft << (6-bitsleft) );
1639 *out++ = '-';
1640 }
1641
Walter Dörwald51ab4142007-05-05 14:43:36 +00001642 if (PyBytes_Resize(v, out - start)) {
1643 Py_DECREF(v);
1644 return NULL;
1645 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001646 return v;
1647}
1648
/* Drop the helper macros used by the UTF-7 code above so they are not
   visible to the codecs that follow. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1655
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656/* --- UTF-8 Codec -------------------------------------------------------- */
1657
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details

       Rows are 16 entries each, indexed by the byte value:
         0x00-0x7F -> 1
         0x80-0xBF -> 0
         0xC0-0xDF -> 2
         0xE0-0xEF -> 3
         0xF0-0xF7 -> 4
         0xF8-0xFB -> 5
         0xFC-0xFD -> 6
         0xFE-0xFF -> 0 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1679
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001681 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 const char *errors)
1683{
Walter Dörwald69652032004-09-07 20:24:22 +00001684 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1685}
1686
/* Decode `size` bytes of UTF-8 starting at `s` into a Unicode object.

   errors   - error handling name, forwarded to
              unicode_decode_call_errorhandler() when bad input is found.
   consumed - may be NULL.  If non-NULL, an incomplete sequence at the
              end of the input is not an error: decoding stops in front
              of it and *consumed receives the number of bytes that were
              processed.

   Returns the Unicode object, or NULL if the error handler call or the
   final resize fails. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: a single ASCII byte. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* Sequence length implied by the lead byte (0 = illegal). */
        n = utf8_code_length[ch];

        if (s + n > e) {
            /* The sequence runs past the end of the input. */
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Bytes >= 0x80 never map to length 1 in the table, so this
               is not expected to be reached. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            if (ch < 0x80) {
                /* Value would have fit in one byte: overlong form. */
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            if (ch < 0x0800) {
                /* Value would have fit in fewer bytes: overlong form.

                   Note: UTF-8 encodings of surrogates are considered
                   legal UTF-8 sequences (only the lower bound is
                   checked here);

                   XXX For wide builds (UCS-4) we should probably try
                   to recombine the surrogates into a single code
                   unit.
                */
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        /* Successful decode: step over the whole sequence. */
        s += n;
        continue;

    utf8Error:
        /* Reached only via goto.  The handler receives the addresses of
           s, p, outpos and unicode, so any of them may be changed by the
           call before the loop continues. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                 errors, &errorHandler,
                 "utf8", errmsg,
                 starts, size, &startinpos, &endinpos, &exc, &s,
                 (PyObject **)&unicode, &outpos, &p))
            goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1873
Tim Peters602f7402002-04-27 18:03:26 +00001874/* Allocation strategy: if the string is short, convert into a stack buffer
1875 and allocate exactly as much space needed at the end. Else allocate the
1876 maximum possible needed (4 result bytes per Unicode character), and return
1877 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001878*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001879PyObject *
1880PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001881 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001882 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883{
Tim Peters602f7402002-04-27 18:03:26 +00001884#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001885
Martin v. Löwis18e16552006-02-15 17:27:45 +00001886 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001887 PyObject *v; /* result string object */
1888 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001889 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001890 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001891 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001892
Tim Peters602f7402002-04-27 18:03:26 +00001893 assert(s != NULL);
1894 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895
Tim Peters602f7402002-04-27 18:03:26 +00001896 if (size <= MAX_SHORT_UNICHARS) {
1897 /* Write into the stack buffer; nallocated can't overflow.
1898 * At the end, we'll allocate exactly as much heap space as it
1899 * turns out we need.
1900 */
1901 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1902 v = NULL; /* will allocate after we're done */
1903 p = stackbuf;
1904 }
1905 else {
1906 /* Overallocate on the heap, and give the excess back at the end. */
1907 nallocated = size * 4;
1908 if (nallocated / 4 != size) /* overflow! */
1909 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001910 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001911 if (v == NULL)
1912 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001913 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001914 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001915
Tim Peters602f7402002-04-27 18:03:26 +00001916 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001917 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001918
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001919 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001920 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001922
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001924 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001925 *p++ = (char)(0xc0 | (ch >> 6));
1926 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001927 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001928 else {
Tim Peters602f7402002-04-27 18:03:26 +00001929 /* Encode UCS2 Unicode ordinals */
1930 if (ch < 0x10000) {
1931 /* Special case: check for high surrogate */
1932 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1933 Py_UCS4 ch2 = s[i];
1934 /* Check for low surrogate and combine the two to
1935 form a UCS4 value */
1936 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001937 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001938 i++;
1939 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001940 }
Tim Peters602f7402002-04-27 18:03:26 +00001941 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001942 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001943 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001944 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1945 *p++ = (char)(0x80 | (ch & 0x3f));
1946 continue;
1947 }
1948encodeUCS4:
1949 /* Encode UCS4 Unicode ordinals */
1950 *p++ = (char)(0xf0 | (ch >> 18));
1951 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1952 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1953 *p++ = (char)(0x80 | (ch & 0x3f));
1954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001956
Tim Peters602f7402002-04-27 18:03:26 +00001957 if (v == NULL) {
1958 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001959 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001960 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001961 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001962 }
1963 else {
1964 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001965 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001966 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001967 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001970
Tim Peters602f7402002-04-27 18:03:26 +00001971#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972}
1973
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1975{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 if (!PyUnicode_Check(unicode)) {
1977 PyErr_BadArgument();
1978 return NULL;
1979 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001980 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1981 PyUnicode_GET_SIZE(unicode),
1982 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983}
1984
1985/* --- UTF-16 Codec ------------------------------------------------------- */
1986
Tim Peters772747b2001-08-09 22:21:55 +00001987PyObject *
1988PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001989 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001990 const char *errors,
1991 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992{
Walter Dörwald69652032004-09-07 20:24:22 +00001993 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1994}
1995
1996PyObject *
1997PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001998 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001999 const char *errors,
2000 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002001 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002002{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002003 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002004 Py_ssize_t startinpos;
2005 Py_ssize_t endinpos;
2006 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007 PyUnicodeObject *unicode;
2008 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002009 const unsigned char *q, *e;
2010 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002011 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002012 /* Offsets from q for retrieving byte pairs in the right order. */
2013#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2014 int ihi = 1, ilo = 0;
2015#else
2016 int ihi = 0, ilo = 1;
2017#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002018 PyObject *errorHandler = NULL;
2019 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020
2021 /* Note: size will always be longer than the resulting Unicode
2022 character count */
2023 unicode = _PyUnicode_New(size);
2024 if (!unicode)
2025 return NULL;
2026 if (size == 0)
2027 return (PyObject *)unicode;
2028
2029 /* Unpack UTF-16 encoded data */
2030 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002031 q = (unsigned char *)s;
2032 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
2034 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002035 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002037 /* Check for BOM marks (U+FEFF) in the input and adjust current
2038 byte order setting accordingly. In native mode, the leading BOM
2039 mark is skipped, in all other modes, it is copied to the output
2040 stream as-is (giving a ZWNBSP character). */
2041 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002042 if (size >= 2) {
2043 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002044#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002045 if (bom == 0xFEFF) {
2046 q += 2;
2047 bo = -1;
2048 }
2049 else if (bom == 0xFFFE) {
2050 q += 2;
2051 bo = 1;
2052 }
Tim Petersced69f82003-09-16 20:30:58 +00002053#else
Walter Dörwald69652032004-09-07 20:24:22 +00002054 if (bom == 0xFEFF) {
2055 q += 2;
2056 bo = 1;
2057 }
2058 else if (bom == 0xFFFE) {
2059 q += 2;
2060 bo = -1;
2061 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002062#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002063 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065
Tim Peters772747b2001-08-09 22:21:55 +00002066 if (bo == -1) {
2067 /* force LE */
2068 ihi = 1;
2069 ilo = 0;
2070 }
2071 else if (bo == 1) {
2072 /* force BE */
2073 ihi = 0;
2074 ilo = 1;
2075 }
2076
2077 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002078 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002079 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002080 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002081 if (consumed)
2082 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002083 errmsg = "truncated data";
2084 startinpos = ((const char *)q)-starts;
2085 endinpos = ((const char *)e)-starts;
2086 goto utf16Error;
2087 /* The remaining input chars are ignored if the callback
2088 chooses to skip the input */
2089 }
2090 ch = (q[ihi] << 8) | q[ilo];
2091
Tim Peters772747b2001-08-09 22:21:55 +00002092 q += 2;
2093
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 if (ch < 0xD800 || ch > 0xDFFF) {
2095 *p++ = ch;
2096 continue;
2097 }
2098
2099 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002100 if (q >= e) {
2101 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002102 startinpos = (((const char *)q)-2)-starts;
2103 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002104 goto utf16Error;
2105 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002106 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002107 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2108 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002109 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002110#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002111 *p++ = ch;
2112 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002113#else
2114 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002115#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002116 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002117 }
2118 else {
2119 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002120 startinpos = (((const char *)q)-4)-starts;
2121 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002122 goto utf16Error;
2123 }
2124
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002126 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 startinpos = (((const char *)q)-2)-starts;
2128 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002129 /* Fall through to report the error */
2130
2131 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 outpos = p-PyUnicode_AS_UNICODE(unicode);
2133 if (unicode_decode_call_errorhandler(
2134 errors, &errorHandler,
2135 "utf16", errmsg,
2136 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2137 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002138 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 }
2140
2141 if (byteorder)
2142 *byteorder = bo;
2143
Walter Dörwald69652032004-09-07 20:24:22 +00002144 if (consumed)
2145 *consumed = (const char *)q-starts;
2146
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002148 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002149 goto onError;
2150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002151 Py_XDECREF(errorHandler);
2152 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 return (PyObject *)unicode;
2154
2155onError:
2156 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002157 Py_XDECREF(errorHandler);
2158 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 return NULL;
2160}
2161
Tim Peters772747b2001-08-09 22:21:55 +00002162PyObject *
2163PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002164 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002165 const char *errors,
2166 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167{
2168 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002169 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002170#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002171 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002172#else
2173 const int pairs = 0;
2174#endif
Tim Peters772747b2001-08-09 22:21:55 +00002175 /* Offsets from p for storing byte pairs in the right order. */
2176#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2177 int ihi = 1, ilo = 0;
2178#else
2179 int ihi = 0, ilo = 1;
2180#endif
2181
2182#define STORECHAR(CH) \
2183 do { \
2184 p[ihi] = ((CH) >> 8) & 0xff; \
2185 p[ilo] = (CH) & 0xff; \
2186 p += 2; \
2187 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002189#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002190 for (i = pairs = 0; i < size; i++)
2191 if (s[i] >= 0x10000)
2192 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002193#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002194 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002195 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 if (v == NULL)
2197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198
Walter Dörwald3cc34522007-05-04 10:48:27 +00002199 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002201 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002202 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002203 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002204
2205 if (byteorder == -1) {
2206 /* force LE */
2207 ihi = 1;
2208 ilo = 0;
2209 }
2210 else if (byteorder == 1) {
2211 /* force BE */
2212 ihi = 0;
2213 ilo = 1;
2214 }
2215
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002216 while (size-- > 0) {
2217 Py_UNICODE ch = *s++;
2218 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002219#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002220 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002221 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2222 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002224#endif
Tim Peters772747b2001-08-09 22:21:55 +00002225 STORECHAR(ch);
2226 if (ch2)
2227 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002230#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231}
2232
2233PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2234{
2235 if (!PyUnicode_Check(unicode)) {
2236 PyErr_BadArgument();
2237 return NULL;
2238 }
2239 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2240 PyUnicode_GET_SIZE(unicode),
2241 NULL,
2242 0);
2243}
2244
2245/* --- Unicode Escape Codec ----------------------------------------------- */
2246
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out
   NULL.
   NOTE(review): presumably set on first use by the unicode-escape
   decoder for character-name lookups -- the assignment is not visible
   in this part of the file; confirm. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002248
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002250 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 const char *errors)
2252{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002253 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002254 Py_ssize_t startinpos;
2255 Py_ssize_t endinpos;
2256 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002261 char* message;
2262 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002263 PyObject *errorHandler = NULL;
2264 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002265
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 /* Escaped strings will always be longer than the resulting
2267 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 length after conversion to the true value.
2269 (but if the error callback returns a long replacement string
2270 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 v = _PyUnicode_New(size);
2272 if (v == NULL)
2273 goto onError;
2274 if (size == 0)
2275 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002276
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002279
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 while (s < end) {
2281 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002282 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002283 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284
2285 /* Non-escape characters are interpreted as Unicode ordinals */
2286 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002287 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288 continue;
2289 }
2290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292 /* \ - Escapes */
2293 s++;
2294 switch (*s++) {
2295
2296 /* \x escapes */
2297 case '\n': break;
2298 case '\\': *p++ = '\\'; break;
2299 case '\'': *p++ = '\''; break;
2300 case '\"': *p++ = '\"'; break;
2301 case 'b': *p++ = '\b'; break;
2302 case 'f': *p++ = '\014'; break; /* FF */
2303 case 't': *p++ = '\t'; break;
2304 case 'n': *p++ = '\n'; break;
2305 case 'r': *p++ = '\r'; break;
2306 case 'v': *p++ = '\013'; break; /* VT */
2307 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2308
2309 /* \OOO (octal) escapes */
2310 case '0': case '1': case '2': case '3':
2311 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002312 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002314 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002316 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002317 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002318 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319 break;
2320
Fredrik Lundhccc74732001-02-18 22:13:49 +00002321 /* hex escapes */
2322 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002324 digits = 2;
2325 message = "truncated \\xXX escape";
2326 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327
Fredrik Lundhccc74732001-02-18 22:13:49 +00002328 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002330 digits = 4;
2331 message = "truncated \\uXXXX escape";
2332 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333
Fredrik Lundhccc74732001-02-18 22:13:49 +00002334 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002335 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002336 digits = 8;
2337 message = "truncated \\UXXXXXXXX escape";
2338 hexescape:
2339 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002340 outpos = p-PyUnicode_AS_UNICODE(v);
2341 if (s+digits>end) {
2342 endinpos = size;
2343 if (unicode_decode_call_errorhandler(
2344 errors, &errorHandler,
2345 "unicodeescape", "end of string in escape sequence",
2346 starts, size, &startinpos, &endinpos, &exc, &s,
2347 (PyObject **)&v, &outpos, &p))
2348 goto onError;
2349 goto nextByte;
2350 }
2351 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002352 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002353 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002354 endinpos = (s+i+1)-starts;
2355 if (unicode_decode_call_errorhandler(
2356 errors, &errorHandler,
2357 "unicodeescape", message,
2358 starts, size, &startinpos, &endinpos, &exc, &s,
2359 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002360 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002361 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002362 }
2363 chr = (chr<<4) & ~0xF;
2364 if (c >= '0' && c <= '9')
2365 chr += c - '0';
2366 else if (c >= 'a' && c <= 'f')
2367 chr += 10 + c - 'a';
2368 else
2369 chr += 10 + c - 'A';
2370 }
2371 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002372 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002373 /* _decoding_error will have already written into the
2374 target buffer. */
2375 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002376 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002377 /* when we get here, chr is a 32-bit unicode character */
2378 if (chr <= 0xffff)
2379 /* UCS-2 character */
2380 *p++ = (Py_UNICODE) chr;
2381 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002382 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002383 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002384#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002385 *p++ = chr;
2386#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002387 chr -= 0x10000L;
2388 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002389 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002390#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002391 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002392 endinpos = s-starts;
2393 outpos = p-PyUnicode_AS_UNICODE(v);
2394 if (unicode_decode_call_errorhandler(
2395 errors, &errorHandler,
2396 "unicodeescape", "illegal Unicode character",
2397 starts, size, &startinpos, &endinpos, &exc, &s,
2398 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002399 goto onError;
2400 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002401 break;
2402
2403 /* \N{name} */
2404 case 'N':
2405 message = "malformed \\N character escape";
2406 if (ucnhash_CAPI == NULL) {
2407 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002408 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002409 m = PyImport_ImportModule("unicodedata");
2410 if (m == NULL)
2411 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002412 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002413 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002414 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002415 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002416 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002417 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002418 if (ucnhash_CAPI == NULL)
2419 goto ucnhashError;
2420 }
2421 if (*s == '{') {
2422 const char *start = s+1;
2423 /* look for the closing brace */
2424 while (*s != '}' && s < end)
2425 s++;
2426 if (s > start && s < end && *s == '}') {
2427 /* found a name. look it up in the unicode database */
2428 message = "unknown Unicode character name";
2429 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002430 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002431 goto store;
2432 }
2433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002434 endinpos = s-starts;
2435 outpos = p-PyUnicode_AS_UNICODE(v);
2436 if (unicode_decode_call_errorhandler(
2437 errors, &errorHandler,
2438 "unicodeescape", message,
2439 starts, size, &startinpos, &endinpos, &exc, &s,
2440 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002441 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002442 break;
2443
2444 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002445 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002446 message = "\\ at end of string";
2447 s--;
2448 endinpos = s-starts;
2449 outpos = p-PyUnicode_AS_UNICODE(v);
2450 if (unicode_decode_call_errorhandler(
2451 errors, &errorHandler,
2452 "unicodeescape", message,
2453 starts, size, &startinpos, &endinpos, &exc, &s,
2454 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002455 goto onError;
2456 }
2457 else {
2458 *p++ = '\\';
2459 *p++ = (unsigned char)s[-1];
2460 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002461 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 nextByte:
2464 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002466 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002467 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002468 Py_XDECREF(errorHandler);
2469 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002471
Fredrik Lundhccc74732001-02-18 22:13:49 +00002472ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002473 PyErr_SetString(
2474 PyExc_UnicodeError,
2475 "\\N escapes not supported (can't load unicodedata module)"
2476 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002477 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 Py_XDECREF(errorHandler);
2479 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002480 return NULL;
2481
Fredrik Lundhccc74732001-02-18 22:13:49 +00002482onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002484 Py_XDECREF(errorHandler);
2485 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 return NULL;
2487}
2488
2489/* Return a Unicode-Escape string version of the Unicode object.
2490
2491 If quotes is true, the string is enclosed in u"" or u'' quotes as
2492 appropriate.
2493
2494*/
2495
Thomas Wouters477c8d52006-05-27 19:21:47 +00002496Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2497 Py_ssize_t size,
2498 Py_UNICODE ch)
2499{
2500 /* like wcschr, but doesn't stop at NULL characters */
2501
2502 while (size-- > 0) {
2503 if (*s == ch)
2504 return s;
2505 s++;
2506 }
2507
2508 return NULL;
2509}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002510
Walter Dörwald79e913e2007-05-12 11:08:06 +00002511static const char *hexdigits = "0123456789abcdef";
2512
2513PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2514 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515{
2516 PyObject *repr;
2517 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518
Thomas Wouters89f507f2006-12-13 04:49:30 +00002519 /* XXX(nnorwitz): rather than over-allocating, it would be
2520 better to choose a different scheme. Perhaps scan the
2521 first N-chars of the string and allocate based on that size.
2522 */
2523 /* Initial allocation is based on the longest-possible unichr
2524 escape.
2525
2526 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2527 unichr, so in this case it's the longest unichr escape. In
2528 narrow (UTF-16) builds this is five chars per source unichr
2529 since there are two unichrs in the surrogate pair, so in narrow
2530 (UTF-16) builds it's not the longest unichr escape.
2531
2532 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2533 so in the narrow (UTF-16) build case it's the longest unichr
2534 escape.
2535 */
2536
Walter Dörwald79e913e2007-05-12 11:08:06 +00002537 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002538#ifdef Py_UNICODE_WIDE
2539 + 10*size
2540#else
2541 + 6*size
2542#endif
2543 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 if (repr == NULL)
2545 return NULL;
2546
Walter Dörwald79e913e2007-05-12 11:08:06 +00002547 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002548
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 while (size-- > 0) {
2550 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002551
Walter Dörwald79e913e2007-05-12 11:08:06 +00002552 /* Escape backslashes */
2553 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554 *p++ = '\\';
2555 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002556 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002557 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002558
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002559#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002560 /* Map 21-bit characters to '\U00xxxxxx' */
2561 else if (ch >= 0x10000) {
2562 *p++ = '\\';
2563 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002564 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2565 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2566 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2567 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2568 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2569 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2570 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2571 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002572 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002573 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002574#else
2575 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002576 else if (ch >= 0xD800 && ch < 0xDC00) {
2577 Py_UNICODE ch2;
2578 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002579
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002580 ch2 = *s++;
2581 size--;
2582 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2583 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2584 *p++ = '\\';
2585 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002586 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2587 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2588 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2589 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2590 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2591 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2592 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2593 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002594 continue;
2595 }
2596 /* Fall through: isolated surrogates are copied as-is */
2597 s--;
2598 size++;
2599 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002600#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002601
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002603 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 *p++ = '\\';
2605 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002606 *p++ = hexdigits[(ch >> 12) & 0x000F];
2607 *p++ = hexdigits[(ch >> 8) & 0x000F];
2608 *p++ = hexdigits[(ch >> 4) & 0x000F];
2609 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002611
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002612 /* Map special whitespace to '\t', \n', '\r' */
2613 else if (ch == '\t') {
2614 *p++ = '\\';
2615 *p++ = 't';
2616 }
2617 else if (ch == '\n') {
2618 *p++ = '\\';
2619 *p++ = 'n';
2620 }
2621 else if (ch == '\r') {
2622 *p++ = '\\';
2623 *p++ = 'r';
2624 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002625
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002626 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002627 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002629 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002630 *p++ = hexdigits[(ch >> 4) & 0x000F];
2631 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002632 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002633
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 /* Copy everything else as-is */
2635 else
2636 *p++ = (char) ch;
2637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638
2639 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002640 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2641 Py_DECREF(repr);
2642 return NULL;
2643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 return repr;
2645}
2646
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2648{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002649 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 if (!PyUnicode_Check(unicode)) {
2651 PyErr_BadArgument();
2652 return NULL;
2653 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002654 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2655 PyUnicode_GET_SIZE(unicode));
2656
2657 if (!s)
2658 return NULL;
2659 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2660 PyBytes_GET_SIZE(s));
2661 Py_DECREF(s);
2662 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663}
2664
2665/* --- Raw Unicode Escape Codec ------------------------------------------- */
2666
2667PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002668 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669 const char *errors)
2670{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002671 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002672 Py_ssize_t startinpos;
2673 Py_ssize_t endinpos;
2674 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002676 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 const char *end;
2678 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 PyObject *errorHandler = NULL;
2680 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002681
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 /* Escaped strings will always be longer than the resulting
2683 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002684 length after conversion to the true value. (But decoding error
2685 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 v = _PyUnicode_New(size);
2687 if (v == NULL)
2688 goto onError;
2689 if (size == 0)
2690 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002691 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 end = s + size;
2693 while (s < end) {
2694 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002695 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002697 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698
2699 /* Non-escape characters are interpreted as Unicode ordinals */
2700 if (*s != '\\') {
2701 *p++ = (unsigned char)*s++;
2702 continue;
2703 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002704 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705
2706 /* \u-escapes are only interpreted iff the number of leading
2707 backslashes if odd */
2708 bs = s;
2709 for (;s < end;) {
2710 if (*s != '\\')
2711 break;
2712 *p++ = (unsigned char)*s++;
2713 }
2714 if (((s - bs) & 1) == 0 ||
2715 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002716 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 continue;
2718 }
2719 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002720 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 s++;
2722
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002723 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002725 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 endinpos = s-starts;
2729 if (unicode_decode_call_errorhandler(
2730 errors, &errorHandler,
2731 "rawunicodeescape", "truncated \\uXXXX",
2732 starts, size, &startinpos, &endinpos, &exc, &s,
2733 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 }
2737 x = (x<<4) & ~0xF;
2738 if (c >= '0' && c <= '9')
2739 x += c - '0';
2740 else if (c >= 'a' && c <= 'f')
2741 x += 10 + c - 'a';
2742 else
2743 x += 10 + c - 'A';
2744 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002745#ifndef Py_UNICODE_WIDE
2746 if (x > 0x10000) {
2747 if (unicode_decode_call_errorhandler(
2748 errors, &errorHandler,
2749 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2750 starts, size, &startinpos, &endinpos, &exc, &s,
2751 (PyObject **)&v, &outpos, &p))
2752 goto onError;
2753 }
2754#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002755 *p++ = x;
2756 nextByte:
2757 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002759 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002760 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002761 Py_XDECREF(errorHandler);
2762 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002764
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 onError:
2766 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767 Py_XDECREF(errorHandler);
2768 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 return NULL;
2770}
2771
2772PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002773 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774{
2775 PyObject *repr;
2776 char *p;
2777 char *q;
2778
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002779#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002780 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002781#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002782 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002783#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 if (repr == NULL)
2785 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002786 if (size == 0)
2787 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788
Walter Dörwald711005d2007-05-12 12:03:26 +00002789 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 while (size-- > 0) {
2791 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002792#ifdef Py_UNICODE_WIDE
2793 /* Map 32-bit characters to '\Uxxxxxxxx' */
2794 if (ch >= 0x10000) {
2795 *p++ = '\\';
2796 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002797 *p++ = hexdigits[(ch >> 28) & 0xf];
2798 *p++ = hexdigits[(ch >> 24) & 0xf];
2799 *p++ = hexdigits[(ch >> 20) & 0xf];
2800 *p++ = hexdigits[(ch >> 16) & 0xf];
2801 *p++ = hexdigits[(ch >> 12) & 0xf];
2802 *p++ = hexdigits[(ch >> 8) & 0xf];
2803 *p++ = hexdigits[(ch >> 4) & 0xf];
2804 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002805 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002806 else
2807#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 /* Map 16-bit characters to '\uxxxx' */
2809 if (ch >= 256) {
2810 *p++ = '\\';
2811 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002812 *p++ = hexdigits[(ch >> 12) & 0xf];
2813 *p++ = hexdigits[(ch >> 8) & 0xf];
2814 *p++ = hexdigits[(ch >> 4) & 0xf];
2815 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002816 }
2817 /* Copy everything else as-is */
2818 else
2819 *p++ = (char) ch;
2820 }
2821 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002822 if (PyBytes_Resize(repr, p - q)) {
2823 Py_DECREF(repr);
2824 return NULL;
2825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 return repr;
2827}
2828
2829PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2830{
Walter Dörwald711005d2007-05-12 12:03:26 +00002831 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002833 PyErr_BadArgument();
2834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002836 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2837 PyUnicode_GET_SIZE(unicode));
2838
2839 if (!s)
2840 return NULL;
2841 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2842 PyBytes_GET_SIZE(s));
2843 Py_DECREF(s);
2844 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845}
2846
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002847/* --- Unicode Internal Codec ------------------------------------------- */
2848
2849PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002850 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002851 const char *errors)
2852{
2853 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002854 Py_ssize_t startinpos;
2855 Py_ssize_t endinpos;
2856 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002857 PyUnicodeObject *v;
2858 Py_UNICODE *p;
2859 const char *end;
2860 const char *reason;
2861 PyObject *errorHandler = NULL;
2862 PyObject *exc = NULL;
2863
Neal Norwitzd43069c2006-01-08 01:12:10 +00002864#ifdef Py_UNICODE_WIDE
2865 Py_UNICODE unimax = PyUnicode_GetMax();
2866#endif
2867
Thomas Wouters89f507f2006-12-13 04:49:30 +00002868 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002869 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2870 if (v == NULL)
2871 goto onError;
2872 if (PyUnicode_GetSize((PyObject *)v) == 0)
2873 return (PyObject *)v;
2874 p = PyUnicode_AS_UNICODE(v);
2875 end = s + size;
2876
2877 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002878 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002879 /* We have to sanity check the raw data, otherwise doom looms for
2880 some malformed UCS-4 data. */
2881 if (
2882 #ifdef Py_UNICODE_WIDE
2883 *p > unimax || *p < 0 ||
2884 #endif
2885 end-s < Py_UNICODE_SIZE
2886 )
2887 {
2888 startinpos = s - starts;
2889 if (end-s < Py_UNICODE_SIZE) {
2890 endinpos = end-starts;
2891 reason = "truncated input";
2892 }
2893 else {
2894 endinpos = s - starts + Py_UNICODE_SIZE;
2895 reason = "illegal code point (> 0x10FFFF)";
2896 }
2897 outpos = p - PyUnicode_AS_UNICODE(v);
2898 if (unicode_decode_call_errorhandler(
2899 errors, &errorHandler,
2900 "unicode_internal", reason,
2901 starts, size, &startinpos, &endinpos, &exc, &s,
2902 (PyObject **)&v, &outpos, &p)) {
2903 goto onError;
2904 }
2905 }
2906 else {
2907 p++;
2908 s += Py_UNICODE_SIZE;
2909 }
2910 }
2911
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002912 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002913 goto onError;
2914 Py_XDECREF(errorHandler);
2915 Py_XDECREF(exc);
2916 return (PyObject *)v;
2917
2918 onError:
2919 Py_XDECREF(v);
2920 Py_XDECREF(errorHandler);
2921 Py_XDECREF(exc);
2922 return NULL;
2923}
2924
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925/* --- Latin-1 Codec ------------------------------------------------------ */
2926
2927PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002928 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929 const char *errors)
2930{
2931 PyUnicodeObject *v;
2932 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002933
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002935 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002936 Py_UNICODE r = *(unsigned char*)s;
2937 return PyUnicode_FromUnicode(&r, 1);
2938 }
2939
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940 v = _PyUnicode_New(size);
2941 if (v == NULL)
2942 goto onError;
2943 if (size == 0)
2944 return (PyObject *)v;
2945 p = PyUnicode_AS_UNICODE(v);
2946 while (size-- > 0)
2947 *p++ = (unsigned char)*s++;
2948 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002949
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 onError:
2951 Py_XDECREF(v);
2952 return NULL;
2953}
2954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002955/* create or adjust a UnicodeEncodeError */
2956static void make_encode_exception(PyObject **exceptionObject,
2957 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002958 const Py_UNICODE *unicode, Py_ssize_t size,
2959 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002960 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002962 if (*exceptionObject == NULL) {
2963 *exceptionObject = PyUnicodeEncodeError_Create(
2964 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 }
2966 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002967 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2968 goto onError;
2969 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2970 goto onError;
2971 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2972 goto onError;
2973 return;
2974 onError:
2975 Py_DECREF(*exceptionObject);
2976 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 }
2978}
2979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002980/* raises a UnicodeEncodeError */
2981static void raise_encode_exception(PyObject **exceptionObject,
2982 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002983 const Py_UNICODE *unicode, Py_ssize_t size,
2984 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 const char *reason)
2986{
2987 make_encode_exception(exceptionObject,
2988 encoding, unicode, size, startpos, endpos, reason);
2989 if (*exceptionObject != NULL)
2990 PyCodec_StrictErrors(*exceptionObject);
2991}
2992
2993/* error handling callback helper:
2994 build arguments, call the callback and check the arguments,
2995 put the result into newpos and return the replacement string, which
2996 has to be freed by the caller */
2997static PyObject *unicode_encode_call_errorhandler(const char *errors,
2998 PyObject **errorHandler,
2999 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003000 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3001 Py_ssize_t startpos, Py_ssize_t endpos,
3002 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003003{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003004 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003005
3006 PyObject *restuple;
3007 PyObject *resunicode;
3008
3009 if (*errorHandler == NULL) {
3010 *errorHandler = PyCodec_LookupError(errors);
3011 if (*errorHandler == NULL)
3012 return NULL;
3013 }
3014
3015 make_encode_exception(exceptionObject,
3016 encoding, unicode, size, startpos, endpos, reason);
3017 if (*exceptionObject == NULL)
3018 return NULL;
3019
3020 restuple = PyObject_CallFunctionObjArgs(
3021 *errorHandler, *exceptionObject, NULL);
3022 if (restuple == NULL)
3023 return NULL;
3024 if (!PyTuple_Check(restuple)) {
3025 PyErr_Format(PyExc_TypeError, &argparse[4]);
3026 Py_DECREF(restuple);
3027 return NULL;
3028 }
3029 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3030 &resunicode, newpos)) {
3031 Py_DECREF(restuple);
3032 return NULL;
3033 }
3034 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003035 *newpos = size+*newpos;
3036 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003037 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003038 Py_DECREF(restuple);
3039 return NULL;
3040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003041 Py_INCREF(resunicode);
3042 Py_DECREF(restuple);
3043 return resunicode;
3044}
3045
3046static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003047 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 const char *errors,
3049 int limit)
3050{
3051 /* output object */
3052 PyObject *res;
3053 /* pointers to the beginning and end+1 of input */
3054 const Py_UNICODE *startp = p;
3055 const Py_UNICODE *endp = p + size;
3056 /* pointer to the beginning of the unencodable characters */
3057 /* const Py_UNICODE *badp = NULL; */
3058 /* pointer into the output */
3059 char *str;
3060 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003061 Py_ssize_t respos = 0;
3062 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003063 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3064 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065 PyObject *errorHandler = NULL;
3066 PyObject *exc = NULL;
3067 /* the following variable is used for caching string comparisons
3068 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3069 int known_errorHandler = -1;
3070
3071 /* allocate enough for a simple encoding without
3072 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003073 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 if (res == NULL)
3075 goto onError;
3076 if (size == 0)
3077 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003078 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 ressize = size;
3080
3081 while (p<endp) {
3082 Py_UNICODE c = *p;
3083
3084 /* can we encode this? */
3085 if (c<limit) {
3086 /* no overflow check, because we know that the space is enough */
3087 *str++ = (char)c;
3088 ++p;
3089 }
3090 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003091 Py_ssize_t unicodepos = p-startp;
3092 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003093 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003094 Py_ssize_t repsize;
3095 Py_ssize_t newpos;
3096 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003097 Py_UNICODE *uni2;
3098 /* startpos for collecting unencodable chars */
3099 const Py_UNICODE *collstart = p;
3100 const Py_UNICODE *collend = p;
3101 /* find all unecodable characters */
3102 while ((collend < endp) && ((*collend)>=limit))
3103 ++collend;
3104 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3105 if (known_errorHandler==-1) {
3106 if ((errors==NULL) || (!strcmp(errors, "strict")))
3107 known_errorHandler = 1;
3108 else if (!strcmp(errors, "replace"))
3109 known_errorHandler = 2;
3110 else if (!strcmp(errors, "ignore"))
3111 known_errorHandler = 3;
3112 else if (!strcmp(errors, "xmlcharrefreplace"))
3113 known_errorHandler = 4;
3114 else
3115 known_errorHandler = 0;
3116 }
3117 switch (known_errorHandler) {
3118 case 1: /* strict */
3119 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3120 goto onError;
3121 case 2: /* replace */
3122 while (collstart++<collend)
3123 *str++ = '?'; /* fall through */
3124 case 3: /* ignore */
3125 p = collend;
3126 break;
3127 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003128 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003129 /* determine replacement size (temporarily (mis)uses p) */
3130 for (p = collstart, repsize = 0; p < collend; ++p) {
3131 if (*p<10)
3132 repsize += 2+1+1;
3133 else if (*p<100)
3134 repsize += 2+2+1;
3135 else if (*p<1000)
3136 repsize += 2+3+1;
3137 else if (*p<10000)
3138 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003139#ifndef Py_UNICODE_WIDE
3140 else
3141 repsize += 2+5+1;
3142#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003143 else if (*p<100000)
3144 repsize += 2+5+1;
3145 else if (*p<1000000)
3146 repsize += 2+6+1;
3147 else
3148 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003149#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003150 }
3151 requiredsize = respos+repsize+(endp-collend);
3152 if (requiredsize > ressize) {
3153 if (requiredsize<2*ressize)
3154 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003155 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003156 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003157 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 ressize = requiredsize;
3159 }
3160 /* generate replacement (temporarily (mis)uses p) */
3161 for (p = collstart; p < collend; ++p) {
3162 str += sprintf(str, "&#%d;", (int)*p);
3163 }
3164 p = collend;
3165 break;
3166 default:
3167 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3168 encoding, reason, startp, size, &exc,
3169 collstart-startp, collend-startp, &newpos);
3170 if (repunicode == NULL)
3171 goto onError;
3172 /* need more space? (at least enough for what we
3173 have+the replacement+the rest of the string, so
3174 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003175 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003176 repsize = PyUnicode_GET_SIZE(repunicode);
3177 requiredsize = respos+repsize+(endp-collend);
3178 if (requiredsize > ressize) {
3179 if (requiredsize<2*ressize)
3180 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003181 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003182 Py_DECREF(repunicode);
3183 goto onError;
3184 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003185 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186 ressize = requiredsize;
3187 }
3188 /* check if there is anything unencodable in the replacement
3189 and copy it to the output */
3190 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3191 c = *uni2;
3192 if (c >= limit) {
3193 raise_encode_exception(&exc, encoding, startp, size,
3194 unicodepos, unicodepos+1, reason);
3195 Py_DECREF(repunicode);
3196 goto onError;
3197 }
3198 *str = (char)c;
3199 }
3200 p = startp + newpos;
3201 Py_DECREF(repunicode);
3202 }
3203 }
3204 }
3205 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003206 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003207 if (respos<ressize)
3208 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003209 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003210 Py_XDECREF(errorHandler);
3211 Py_XDECREF(exc);
3212 return res;
3213
3214 onError:
3215 Py_XDECREF(res);
3216 Py_XDECREF(errorHandler);
3217 Py_XDECREF(exc);
3218 return NULL;
3219}
3220
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003222 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 const char *errors)
3224{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003225 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226}
3227
3228PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3229{
3230 if (!PyUnicode_Check(unicode)) {
3231 PyErr_BadArgument();
3232 return NULL;
3233 }
3234 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3235 PyUnicode_GET_SIZE(unicode),
3236 NULL);
3237}
3238
3239/* --- 7-bit ASCII Codec -------------------------------------------------- */
3240
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003242 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 const char *errors)
3244{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 PyUnicodeObject *v;
3247 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003248 Py_ssize_t startinpos;
3249 Py_ssize_t endinpos;
3250 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003251 const char *e;
3252 PyObject *errorHandler = NULL;
3253 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003254
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003256 if (size == 1 && *(unsigned char*)s < 128) {
3257 Py_UNICODE r = *(unsigned char*)s;
3258 return PyUnicode_FromUnicode(&r, 1);
3259 }
Tim Petersced69f82003-09-16 20:30:58 +00003260
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 v = _PyUnicode_New(size);
3262 if (v == NULL)
3263 goto onError;
3264 if (size == 0)
3265 return (PyObject *)v;
3266 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003267 e = s + size;
3268 while (s < e) {
3269 register unsigned char c = (unsigned char)*s;
3270 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 ++s;
3273 }
3274 else {
3275 startinpos = s-starts;
3276 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003277 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 if (unicode_decode_call_errorhandler(
3279 errors, &errorHandler,
3280 "ascii", "ordinal not in range(128)",
3281 starts, size, &startinpos, &endinpos, &exc, &s,
3282 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003286 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003287 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003288 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 Py_XDECREF(errorHandler);
3290 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003292
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 onError:
3294 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 Py_XDECREF(errorHandler);
3296 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 return NULL;
3298}
3299
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003301 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 const char *errors)
3303{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305}
3306
3307PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3308{
3309 if (!PyUnicode_Check(unicode)) {
3310 PyErr_BadArgument();
3311 return NULL;
3312 }
3313 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3314 PyUnicode_GET_SIZE(unicode),
3315 NULL);
3316}
3317
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003318#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003319
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003320/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003321
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003322#if SIZEOF_INT < SIZEOF_SSIZE_T
3323#define NEED_RETRY
3324#endif
3325
3326/* XXX This code is limited to "true" double-byte encodings, as
3327 a) it assumes an incomplete character consists of a single byte, and
3328 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3329 encodings, see IsDBCSLeadByteEx documentation. */
3330
/* Return nonzero if the byte at s[offset] is a DBCS lead byte, taking the
   preceding character (as located by CharPrev) into account so that a
   trail byte with a lead-byte value is not misreported. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
3341
3342/*
3343 * Decode MBCS string into unicode object. If 'final' is set, converts
3344 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3345 */
3346static int decode_mbcs(PyUnicodeObject **v,
3347 const char *s, /* MBCS string */
3348 int size, /* sizeof MBCS string */
3349 int final)
3350{
3351 Py_UNICODE *p;
3352 Py_ssize_t n = 0;
3353 int usize = 0;
3354
3355 assert(size >= 0);
3356
3357 /* Skip trailing lead-byte unless 'final' is set */
3358 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3359 --size;
3360
3361 /* First get the size of the result */
3362 if (size > 0) {
3363 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3364 if (usize == 0) {
3365 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3366 return -1;
3367 }
3368 }
3369
3370 if (*v == NULL) {
3371 /* Create unicode object */
3372 *v = _PyUnicode_New(usize);
3373 if (*v == NULL)
3374 return -1;
3375 }
3376 else {
3377 /* Extend unicode object */
3378 n = PyUnicode_GET_SIZE(*v);
3379 if (_PyUnicode_Resize(v, n + usize) < 0)
3380 return -1;
3381 }
3382
3383 /* Do the conversion */
3384 if (size > 0) {
3385 p = PyUnicode_AS_UNICODE(*v) + n;
3386 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3387 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3388 return -1;
3389 }
3390 }
3391
3392 return size;
3393}
3394
3395PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3396 Py_ssize_t size,
3397 const char *errors,
3398 Py_ssize_t *consumed)
3399{
3400 PyUnicodeObject *v = NULL;
3401 int done;
3402
3403 if (consumed)
3404 *consumed = 0;
3405
3406#ifdef NEED_RETRY
3407 retry:
3408 if (size > INT_MAX)
3409 done = decode_mbcs(&v, s, INT_MAX, 0);
3410 else
3411#endif
3412 done = decode_mbcs(&v, s, (int)size, !consumed);
3413
3414 if (done < 0) {
3415 Py_XDECREF(v);
3416 return NULL;
3417 }
3418
3419 if (consumed)
3420 *consumed += done;
3421
3422#ifdef NEED_RETRY
3423 if (size > INT_MAX) {
3424 s += done;
3425 size -= done;
3426 goto retry;
3427 }
3428#endif
3429
3430 return (PyObject *)v;
3431}
3432
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003433PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003434 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003435 const char *errors)
3436{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003437 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3438}
3439
3440/*
3441 * Convert unicode into string object (MBCS).
3442 * Returns 0 if succeed, -1 otherwise.
3443 */
3444static int encode_mbcs(PyObject **repr,
3445 const Py_UNICODE *p, /* unicode */
3446 int size) /* size of unicode */
3447{
3448 int mbcssize = 0;
3449 Py_ssize_t n = 0;
3450
3451 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003452
3453 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003454 if (size > 0) {
3455 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3456 if (mbcssize == 0) {
3457 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3458 return -1;
3459 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003460 }
3461
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003462 if (*repr == NULL) {
3463 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003464 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003465 if (*repr == NULL)
3466 return -1;
3467 }
3468 else {
3469 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003470 n = PyBytes_Size(*repr);
3471 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003472 return -1;
3473 }
3474
3475 /* Do the conversion */
3476 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003477 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003478 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3479 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3480 return -1;
3481 }
3482 }
3483
3484 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003485}
3486
3487PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003488 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003489 const char *errors)
3490{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003491 PyObject *repr = NULL;
3492 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003493
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003494#ifdef NEED_RETRY
3495 retry:
3496 if (size > INT_MAX)
3497 ret = encode_mbcs(&repr, p, INT_MAX);
3498 else
3499#endif
3500 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003501
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003502 if (ret < 0) {
3503 Py_XDECREF(repr);
3504 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003505 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003506
3507#ifdef NEED_RETRY
3508 if (size > INT_MAX) {
3509 p += INT_MAX;
3510 size -= INT_MAX;
3511 goto retry;
3512 }
3513#endif
3514
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003515 return repr;
3516}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003517
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003518PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3519{
3520 if (!PyUnicode_Check(unicode)) {
3521 PyErr_BadArgument();
3522 return NULL;
3523 }
3524 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3525 PyUnicode_GET_SIZE(unicode),
3526 NULL);
3527}
3528
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003529#undef NEED_RETRY
3530
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003531#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003532
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533/* --- Character Mapping Codec -------------------------------------------- */
3534
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003536 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 PyObject *mapping,
3538 const char *errors)
3539{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003541 Py_ssize_t startinpos;
3542 Py_ssize_t endinpos;
3543 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545 PyUnicodeObject *v;
3546 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003547 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 PyObject *errorHandler = NULL;
3549 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003550 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003551 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003552
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 /* Default to Latin-1 */
3554 if (mapping == NULL)
3555 return PyUnicode_DecodeLatin1(s, size, errors);
3556
3557 v = _PyUnicode_New(size);
3558 if (v == NULL)
3559 goto onError;
3560 if (size == 0)
3561 return (PyObject *)v;
3562 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003564 if (PyUnicode_CheckExact(mapping)) {
3565 mapstring = PyUnicode_AS_UNICODE(mapping);
3566 maplen = PyUnicode_GET_SIZE(mapping);
3567 while (s < e) {
3568 unsigned char ch = *s;
3569 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003571 if (ch < maplen)
3572 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003574 if (x == 0xfffe) {
3575 /* undefined mapping */
3576 outpos = p-PyUnicode_AS_UNICODE(v);
3577 startinpos = s-starts;
3578 endinpos = startinpos+1;
3579 if (unicode_decode_call_errorhandler(
3580 errors, &errorHandler,
3581 "charmap", "character maps to <undefined>",
3582 starts, size, &startinpos, &endinpos, &exc, &s,
3583 (PyObject **)&v, &outpos, &p)) {
3584 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003585 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003586 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003587 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003588 *p++ = x;
3589 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003591 }
3592 else {
3593 while (s < e) {
3594 unsigned char ch = *s;
3595 PyObject *w, *x;
3596
3597 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3598 w = PyInt_FromLong((long)ch);
3599 if (w == NULL)
3600 goto onError;
3601 x = PyObject_GetItem(mapping, w);
3602 Py_DECREF(w);
3603 if (x == NULL) {
3604 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3605 /* No mapping found means: mapping is undefined. */
3606 PyErr_Clear();
3607 x = Py_None;
3608 Py_INCREF(x);
3609 } else
3610 goto onError;
3611 }
3612
3613 /* Apply mapping */
3614 if (PyInt_Check(x)) {
3615 long value = PyInt_AS_LONG(x);
3616 if (value < 0 || value > 65535) {
3617 PyErr_SetString(PyExc_TypeError,
3618 "character mapping must be in range(65536)");
3619 Py_DECREF(x);
3620 goto onError;
3621 }
3622 *p++ = (Py_UNICODE)value;
3623 }
3624 else if (x == Py_None) {
3625 /* undefined mapping */
3626 outpos = p-PyUnicode_AS_UNICODE(v);
3627 startinpos = s-starts;
3628 endinpos = startinpos+1;
3629 if (unicode_decode_call_errorhandler(
3630 errors, &errorHandler,
3631 "charmap", "character maps to <undefined>",
3632 starts, size, &startinpos, &endinpos, &exc, &s,
3633 (PyObject **)&v, &outpos, &p)) {
3634 Py_DECREF(x);
3635 goto onError;
3636 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003637 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003638 continue;
3639 }
3640 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003641 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003642
3643 if (targetsize == 1)
3644 /* 1-1 mapping */
3645 *p++ = *PyUnicode_AS_UNICODE(x);
3646
3647 else if (targetsize > 1) {
3648 /* 1-n mapping */
3649 if (targetsize > extrachars) {
3650 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003651 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3652 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003653 (targetsize << 2);
3654 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003655 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003656 if (_PyUnicode_Resize(&v,
3657 PyUnicode_GET_SIZE(v) + needed) < 0) {
3658 Py_DECREF(x);
3659 goto onError;
3660 }
3661 p = PyUnicode_AS_UNICODE(v) + oldpos;
3662 }
3663 Py_UNICODE_COPY(p,
3664 PyUnicode_AS_UNICODE(x),
3665 targetsize);
3666 p += targetsize;
3667 extrachars -= targetsize;
3668 }
3669 /* 1-0 mapping: skip the character */
3670 }
3671 else {
3672 /* wrong return value */
3673 PyErr_SetString(PyExc_TypeError,
3674 "character mapping must return integer, None or unicode");
3675 Py_DECREF(x);
3676 goto onError;
3677 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003679 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 }
3682 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003683 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 Py_XDECREF(errorHandler);
3686 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003688
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 Py_XDECREF(errorHandler);
3691 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 Py_XDECREF(v);
3693 return NULL;
3694}
3695
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003696/* Charmap encoding: the lookup table */
3697
3698struct encoding_map{
3699 PyObject_HEAD
3700 unsigned char level1[32];
3701 int count2, count3;
3702 unsigned char level23[1];
3703};
3704
3705static PyObject*
3706encoding_map_size(PyObject *obj, PyObject* args)
3707{
3708 struct encoding_map *map = (struct encoding_map*)obj;
3709 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3710 128*map->count3);
3711}
3712
3713static PyMethodDef encoding_map_methods[] = {
3714 {"size", encoding_map_size, METH_NOARGS,
3715 PyDoc_STR("Return the size (in bytes) of this object") },
3716 { 0 }
3717};
3718
3719static void
3720encoding_map_dealloc(PyObject* o)
3721{
3722 PyObject_FREE(o);
3723}
3724
3725static PyTypeObject EncodingMapType = {
3726 PyObject_HEAD_INIT(NULL)
3727 0, /*ob_size*/
3728 "EncodingMap", /*tp_name*/
3729 sizeof(struct encoding_map), /*tp_basicsize*/
3730 0, /*tp_itemsize*/
3731 /* methods */
3732 encoding_map_dealloc, /*tp_dealloc*/
3733 0, /*tp_print*/
3734 0, /*tp_getattr*/
3735 0, /*tp_setattr*/
3736 0, /*tp_compare*/
3737 0, /*tp_repr*/
3738 0, /*tp_as_number*/
3739 0, /*tp_as_sequence*/
3740 0, /*tp_as_mapping*/
3741 0, /*tp_hash*/
3742 0, /*tp_call*/
3743 0, /*tp_str*/
3744 0, /*tp_getattro*/
3745 0, /*tp_setattro*/
3746 0, /*tp_as_buffer*/
3747 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3748 0, /*tp_doc*/
3749 0, /*tp_traverse*/
3750 0, /*tp_clear*/
3751 0, /*tp_richcompare*/
3752 0, /*tp_weaklistoffset*/
3753 0, /*tp_iter*/
3754 0, /*tp_iternext*/
3755 encoding_map_methods, /*tp_methods*/
3756 0, /*tp_members*/
3757 0, /*tp_getset*/
3758 0, /*tp_base*/
3759 0, /*tp_dict*/
3760 0, /*tp_descr_get*/
3761 0, /*tp_descr_set*/
3762 0, /*tp_dictoffset*/
3763 0, /*tp_init*/
3764 0, /*tp_alloc*/
3765 0, /*tp_new*/
3766 0, /*tp_free*/
3767 0, /*tp_is_gc*/
3768};
3769
3770PyObject*
3771PyUnicode_BuildEncodingMap(PyObject* string)
3772{
3773 Py_UNICODE *decode;
3774 PyObject *result;
3775 struct encoding_map *mresult;
3776 int i;
3777 int need_dict = 0;
3778 unsigned char level1[32];
3779 unsigned char level2[512];
3780 unsigned char *mlevel1, *mlevel2, *mlevel3;
3781 int count2 = 0, count3 = 0;
3782
3783 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3784 PyErr_BadArgument();
3785 return NULL;
3786 }
3787 decode = PyUnicode_AS_UNICODE(string);
3788 memset(level1, 0xFF, sizeof level1);
3789 memset(level2, 0xFF, sizeof level2);
3790
3791 /* If there isn't a one-to-one mapping of NULL to \0,
3792 or if there are non-BMP characters, we need to use
3793 a mapping dictionary. */
3794 if (decode[0] != 0)
3795 need_dict = 1;
3796 for (i = 1; i < 256; i++) {
3797 int l1, l2;
3798 if (decode[i] == 0
3799 #ifdef Py_UNICODE_WIDE
3800 || decode[i] > 0xFFFF
3801 #endif
3802 ) {
3803 need_dict = 1;
3804 break;
3805 }
3806 if (decode[i] == 0xFFFE)
3807 /* unmapped character */
3808 continue;
3809 l1 = decode[i] >> 11;
3810 l2 = decode[i] >> 7;
3811 if (level1[l1] == 0xFF)
3812 level1[l1] = count2++;
3813 if (level2[l2] == 0xFF)
3814 level2[l2] = count3++;
3815 }
3816
3817 if (count2 >= 0xFF || count3 >= 0xFF)
3818 need_dict = 1;
3819
3820 if (need_dict) {
3821 PyObject *result = PyDict_New();
3822 PyObject *key, *value;
3823 if (!result)
3824 return NULL;
3825 for (i = 0; i < 256; i++) {
3826 key = value = NULL;
3827 key = PyInt_FromLong(decode[i]);
3828 value = PyInt_FromLong(i);
3829 if (!key || !value)
3830 goto failed1;
3831 if (PyDict_SetItem(result, key, value) == -1)
3832 goto failed1;
3833 Py_DECREF(key);
3834 Py_DECREF(value);
3835 }
3836 return result;
3837 failed1:
3838 Py_XDECREF(key);
3839 Py_XDECREF(value);
3840 Py_DECREF(result);
3841 return NULL;
3842 }
3843
3844 /* Create a three-level trie */
3845 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3846 16*count2 + 128*count3 - 1);
3847 if (!result)
3848 return PyErr_NoMemory();
3849 PyObject_Init(result, &EncodingMapType);
3850 mresult = (struct encoding_map*)result;
3851 mresult->count2 = count2;
3852 mresult->count3 = count3;
3853 mlevel1 = mresult->level1;
3854 mlevel2 = mresult->level23;
3855 mlevel3 = mresult->level23 + 16*count2;
3856 memcpy(mlevel1, level1, 32);
3857 memset(mlevel2, 0xFF, 16*count2);
3858 memset(mlevel3, 0, 128*count3);
3859 count3 = 0;
3860 for (i = 1; i < 256; i++) {
3861 int o1, o2, o3, i2, i3;
3862 if (decode[i] == 0xFFFE)
3863 /* unmapped character */
3864 continue;
3865 o1 = decode[i]>>11;
3866 o2 = (decode[i]>>7) & 0xF;
3867 i2 = 16*mlevel1[o1] + o2;
3868 if (mlevel2[i2] == 0xFF)
3869 mlevel2[i2] = count3++;
3870 o3 = decode[i] & 0x7F;
3871 i3 = 128*mlevel2[i2] + o3;
3872 mlevel3[i3] = i;
3873 }
3874 return result;
3875}
3876
3877static int
3878encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3879{
3880 struct encoding_map *map = (struct encoding_map*)mapping;
3881 int l1 = c>>11;
3882 int l2 = (c>>7) & 0xF;
3883 int l3 = c & 0x7F;
3884 int i;
3885
3886#ifdef Py_UNICODE_WIDE
3887 if (c > 0xFFFF) {
3888 return -1;
3889 }
3890#endif
3891 if (c == 0)
3892 return 0;
3893 /* level 1*/
3894 i = map->level1[l1];
3895 if (i == 0xFF) {
3896 return -1;
3897 }
3898 /* level 2*/
3899 i = map->level23[16*i+l2];
3900 if (i == 0xFF) {
3901 return -1;
3902 }
3903 /* level 3 */
3904 i = map->level23[16*map->count2 + 128*i + l3];
3905 if (i == 0) {
3906 return -1;
3907 }
3908 return i;
3909}
3910
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911/* Lookup the character ch in the mapping. If the character
3912 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003913 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 PyObject *w = PyInt_FromLong((long)c);
3917 PyObject *x;
3918
3919 if (w == NULL)
3920 return NULL;
3921 x = PyObject_GetItem(mapping, w);
3922 Py_DECREF(w);
3923 if (x == NULL) {
3924 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3925 /* No mapping found means: mapping is undefined. */
3926 PyErr_Clear();
3927 x = Py_None;
3928 Py_INCREF(x);
3929 return x;
3930 } else
3931 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003933 else if (x == Py_None)
3934 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 else if (PyInt_Check(x)) {
3936 long value = PyInt_AS_LONG(x);
3937 if (value < 0 || value > 255) {
3938 PyErr_SetString(PyExc_TypeError,
3939 "character mapping must be in range(256)");
3940 Py_DECREF(x);
3941 return NULL;
3942 }
3943 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 else if (PyString_Check(x))
3946 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003949 PyErr_Format(PyExc_TypeError,
3950 "character mapping must return integer, None or str8, not %.400s",
3951 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 Py_DECREF(x);
3953 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 }
3955}
3956
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003957static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003958charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003959{
Walter Dörwald827b0552007-05-12 13:23:53 +00003960 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003961 /* exponentially overallocate to minimize reallocations */
3962 if (requiredsize < 2*outsize)
3963 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003964 if (PyBytes_Resize(outobj, requiredsize)) {
3965 Py_DECREF(outobj);
3966 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003967 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003968 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003969}
3970
/* Outcome of trying to encode a single character via a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003974/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003975 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976 space is available. Return a new reference to the object that
3977 was put in the output buffer, or Py_None, if the mapping was undefined
3978 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003979 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003981charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003982 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003983{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003984 PyObject *rep;
3985 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003986 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003988 if (mapping->ob_type == &EncodingMapType) {
3989 int res = encoding_map_lookup(c, mapping);
3990 Py_ssize_t requiredsize = *outpos+1;
3991 if (res == -1)
3992 return enc_FAILED;
3993 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003994 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003995 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003996 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003997 outstart[(*outpos)++] = (char)res;
3998 return enc_SUCCESS;
3999 }
4000
4001 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004002 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004003 return enc_EXCEPTION;
4004 else if (rep==Py_None) {
4005 Py_DECREF(rep);
4006 return enc_FAILED;
4007 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004008 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004009 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004010 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004011 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004012 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004013 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004015 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004016 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4017 }
4018 else {
4019 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004020 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4021 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004022 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004023 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004025 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004027 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028 memcpy(outstart + *outpos, repchars, repsize);
4029 *outpos += repsize;
4030 }
4031 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004032 Py_DECREF(rep);
4033 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034}
4035
4036/* handle an error in PyUnicode_EncodeCharmap
4037 Return 0 on success, -1 on error */
4038static
4039int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004040 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004042 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004043 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044{
4045 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004046 Py_ssize_t repsize;
4047 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048 Py_UNICODE *uni2;
4049 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004050 Py_ssize_t collstartpos = *inpos;
4051 Py_ssize_t collendpos = *inpos+1;
4052 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 char *encoding = "charmap";
4054 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004055 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057 /* find all unencodable characters */
4058 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004059 PyObject *rep;
4060 if (mapping->ob_type == &EncodingMapType) {
4061 int res = encoding_map_lookup(p[collendpos], mapping);
4062 if (res != -1)
4063 break;
4064 ++collendpos;
4065 continue;
4066 }
4067
4068 rep = charmapencode_lookup(p[collendpos], mapping);
4069 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004071 else if (rep!=Py_None) {
4072 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073 break;
4074 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004075 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076 ++collendpos;
4077 }
4078 /* cache callback name lookup
4079 * (if not done yet, i.e. it's the first error) */
4080 if (*known_errorHandler==-1) {
4081 if ((errors==NULL) || (!strcmp(errors, "strict")))
4082 *known_errorHandler = 1;
4083 else if (!strcmp(errors, "replace"))
4084 *known_errorHandler = 2;
4085 else if (!strcmp(errors, "ignore"))
4086 *known_errorHandler = 3;
4087 else if (!strcmp(errors, "xmlcharrefreplace"))
4088 *known_errorHandler = 4;
4089 else
4090 *known_errorHandler = 0;
4091 }
4092 switch (*known_errorHandler) {
4093 case 1: /* strict */
4094 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4095 return -1;
4096 case 2: /* replace */
4097 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4098 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004099 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100 return -1;
4101 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004102 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004103 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4104 return -1;
4105 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 }
4107 /* fall through */
4108 case 3: /* ignore */
4109 *inpos = collendpos;
4110 break;
4111 case 4: /* xmlcharrefreplace */
4112 /* generate replacement (temporarily (mis)uses p) */
4113 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4114 char buffer[2+29+1+1];
4115 char *cp;
4116 sprintf(buffer, "&#%d;", (int)p[collpos]);
4117 for (cp = buffer; *cp; ++cp) {
4118 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004119 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004121 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4123 return -1;
4124 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 }
4126 }
4127 *inpos = collendpos;
4128 break;
4129 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004130 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 encoding, reason, p, size, exceptionObject,
4132 collstartpos, collendpos, &newpos);
4133 if (repunicode == NULL)
4134 return -1;
4135 /* generate replacement */
4136 repsize = PyUnicode_GET_SIZE(repunicode);
4137 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4138 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004139 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004140 return -1;
4141 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004142 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4145 return -1;
4146 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 }
4148 *inpos = newpos;
4149 Py_DECREF(repunicode);
4150 }
4151 return 0;
4152}
4153
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004155 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 PyObject *mapping,
4157 const char *errors)
4158{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 /* output object */
4160 PyObject *res = NULL;
4161 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004162 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004164 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 PyObject *errorHandler = NULL;
4166 PyObject *exc = NULL;
4167 /* the following variable is used for caching string comparisons
4168 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4169 * 3=ignore, 4=xmlcharrefreplace */
4170 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171
4172 /* Default to Latin-1 */
4173 if (mapping == NULL)
4174 return PyUnicode_EncodeLatin1(p, size, errors);
4175
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 /* allocate enough for a simple encoding without
4177 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004178 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 if (res == NULL)
4180 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004181 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184 while (inpos<size) {
4185 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004186 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004187 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004189 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (charmap_encoding_error(p, size, &inpos, mapping,
4191 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004192 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004193 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004194 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 else
4198 /* done with this character => adjust input position */
4199 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004203 if (respos<PyBytes_GET_SIZE(res)) {
4204 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 goto onError;
4206 }
4207 Py_XDECREF(exc);
4208 Py_XDECREF(errorHandler);
4209 return res;
4210
4211 onError:
4212 Py_XDECREF(res);
4213 Py_XDECREF(exc);
4214 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215 return NULL;
4216}
4217
4218PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4219 PyObject *mapping)
4220{
4221 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4222 PyErr_BadArgument();
4223 return NULL;
4224 }
4225 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4226 PyUnicode_GET_SIZE(unicode),
4227 mapping,
4228 NULL);
4229}
4230
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231/* create or adjust a UnicodeTranslateError */
4232static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004233 const Py_UNICODE *unicode, Py_ssize_t size,
4234 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237 if (*exceptionObject == NULL) {
4238 *exceptionObject = PyUnicodeTranslateError_Create(
4239 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004240 }
4241 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4243 goto onError;
4244 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4245 goto onError;
4246 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4247 goto onError;
4248 return;
4249 onError:
4250 Py_DECREF(*exceptionObject);
4251 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252 }
4253}
4254
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255/* raises a UnicodeTranslateError */
4256static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004257 const Py_UNICODE *unicode, Py_ssize_t size,
4258 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 const char *reason)
4260{
4261 make_translate_exception(exceptionObject,
4262 unicode, size, startpos, endpos, reason);
4263 if (*exceptionObject != NULL)
4264 PyCodec_StrictErrors(*exceptionObject);
4265}
4266
4267/* error handling callback helper:
4268 build arguments, call the callback and check the arguments,
4269 put the result into newpos and return the replacement string, which
4270 has to be freed by the caller */
4271static PyObject *unicode_translate_call_errorhandler(const char *errors,
4272 PyObject **errorHandler,
4273 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004274 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4275 Py_ssize_t startpos, Py_ssize_t endpos,
4276 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004278 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004280 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 PyObject *restuple;
4282 PyObject *resunicode;
4283
4284 if (*errorHandler == NULL) {
4285 *errorHandler = PyCodec_LookupError(errors);
4286 if (*errorHandler == NULL)
4287 return NULL;
4288 }
4289
4290 make_translate_exception(exceptionObject,
4291 unicode, size, startpos, endpos, reason);
4292 if (*exceptionObject == NULL)
4293 return NULL;
4294
4295 restuple = PyObject_CallFunctionObjArgs(
4296 *errorHandler, *exceptionObject, NULL);
4297 if (restuple == NULL)
4298 return NULL;
4299 if (!PyTuple_Check(restuple)) {
4300 PyErr_Format(PyExc_TypeError, &argparse[4]);
4301 Py_DECREF(restuple);
4302 return NULL;
4303 }
4304 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004305 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306 Py_DECREF(restuple);
4307 return NULL;
4308 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004309 if (i_newpos<0)
4310 *newpos = size+i_newpos;
4311 else
4312 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004313 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004314 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004315 Py_DECREF(restuple);
4316 return NULL;
4317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004318 Py_INCREF(resunicode);
4319 Py_DECREF(restuple);
4320 return resunicode;
4321}
4322
4323/* Lookup the character ch in the mapping and put the result in result,
4324 which must be decrefed by the caller.
4325 Return 0 on success, -1 on error */
4326static
4327int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4328{
4329 PyObject *w = PyInt_FromLong((long)c);
4330 PyObject *x;
4331
4332 if (w == NULL)
4333 return -1;
4334 x = PyObject_GetItem(mapping, w);
4335 Py_DECREF(w);
4336 if (x == NULL) {
4337 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4338 /* No mapping found means: use 1:1 mapping. */
4339 PyErr_Clear();
4340 *result = NULL;
4341 return 0;
4342 } else
4343 return -1;
4344 }
4345 else if (x == Py_None) {
4346 *result = x;
4347 return 0;
4348 }
4349 else if (PyInt_Check(x)) {
4350 long value = PyInt_AS_LONG(x);
4351 long max = PyUnicode_GetMax();
4352 if (value < 0 || value > max) {
4353 PyErr_Format(PyExc_TypeError,
4354 "character mapping must be in range(0x%lx)", max+1);
4355 Py_DECREF(x);
4356 return -1;
4357 }
4358 *result = x;
4359 return 0;
4360 }
4361 else if (PyUnicode_Check(x)) {
4362 *result = x;
4363 return 0;
4364 }
4365 else {
4366 /* wrong return value */
4367 PyErr_SetString(PyExc_TypeError,
4368 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004369 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 return -1;
4371 }
4372}
4373/* ensure that *outobj is at least requiredsize characters long,
4374if not reallocate and adjust various state variables.
4375Return 0 on success, -1 on error */
4376static
Walter Dörwald4894c302003-10-24 14:25:28 +00004377int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004378 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004380 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004381 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004383 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004385 if (requiredsize < 2 * oldsize)
4386 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004387 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 return -1;
4389 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390 }
4391 return 0;
4392}
4393/* lookup the character, put the result in the output string and adjust
4394 various state variables. Return a new reference to the object that
4395 was put in the output buffer in *result, or Py_None, if the mapping was
4396 undefined (in which case no character was written).
4397 The called must decref result.
4398 Return 0 on success, -1 on error. */
4399static
Walter Dörwald4894c302003-10-24 14:25:28 +00004400int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004401 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004402 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403{
Walter Dörwald4894c302003-10-24 14:25:28 +00004404 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 return -1;
4406 if (*res==NULL) {
4407 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004408 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 }
4410 else if (*res==Py_None)
4411 ;
4412 else if (PyInt_Check(*res)) {
4413 /* no overflow check, because we know that the space is enough */
4414 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4415 }
4416 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004417 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 if (repsize==1) {
4419 /* no overflow check, because we know that the space is enough */
4420 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4421 }
4422 else if (repsize!=0) {
4423 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004424 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004425 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004426 repsize - 1;
4427 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 return -1;
4429 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4430 *outp += repsize;
4431 }
4432 }
4433 else
4434 return -1;
4435 return 0;
4436}
4437
4438PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004439 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 PyObject *mapping,
4441 const char *errors)
4442{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 /* output object */
4444 PyObject *res = NULL;
4445 /* pointers to the beginning and end+1 of input */
4446 const Py_UNICODE *startp = p;
4447 const Py_UNICODE *endp = p + size;
4448 /* pointer into the output */
4449 Py_UNICODE *str;
4450 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004451 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 char *reason = "character maps to <undefined>";
4453 PyObject *errorHandler = NULL;
4454 PyObject *exc = NULL;
4455 /* the following variable is used for caching string comparisons
4456 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4457 * 3=ignore, 4=xmlcharrefreplace */
4458 int known_errorHandler = -1;
4459
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 if (mapping == NULL) {
4461 PyErr_BadArgument();
4462 return NULL;
4463 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464
4465 /* allocate enough for a simple 1:1 translation without
4466 replacements, if we need more, we'll resize */
4467 res = PyUnicode_FromUnicode(NULL, size);
4468 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004469 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 return res;
4472 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474 while (p<endp) {
4475 /* try to encode it */
4476 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004477 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 goto onError;
4480 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004481 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 if (x!=Py_None) /* it worked => adjust input pointer */
4483 ++p;
4484 else { /* untranslatable character */
4485 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004486 Py_ssize_t repsize;
4487 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 Py_UNICODE *uni2;
4489 /* startpos for collecting untranslatable chars */
4490 const Py_UNICODE *collstart = p;
4491 const Py_UNICODE *collend = p+1;
4492 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 /* find all untranslatable characters */
4495 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004496 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004497 goto onError;
4498 Py_XDECREF(x);
4499 if (x!=Py_None)
4500 break;
4501 ++collend;
4502 }
4503 /* cache callback name lookup
4504 * (if not done yet, i.e. it's the first error) */
4505 if (known_errorHandler==-1) {
4506 if ((errors==NULL) || (!strcmp(errors, "strict")))
4507 known_errorHandler = 1;
4508 else if (!strcmp(errors, "replace"))
4509 known_errorHandler = 2;
4510 else if (!strcmp(errors, "ignore"))
4511 known_errorHandler = 3;
4512 else if (!strcmp(errors, "xmlcharrefreplace"))
4513 known_errorHandler = 4;
4514 else
4515 known_errorHandler = 0;
4516 }
4517 switch (known_errorHandler) {
4518 case 1: /* strict */
4519 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4520 goto onError;
4521 case 2: /* replace */
4522 /* No need to check for space, this is a 1:1 replacement */
4523 for (coll = collstart; coll<collend; ++coll)
4524 *str++ = '?';
4525 /* fall through */
4526 case 3: /* ignore */
4527 p = collend;
4528 break;
4529 case 4: /* xmlcharrefreplace */
4530 /* generate replacement (temporarily (mis)uses p) */
4531 for (p = collstart; p < collend; ++p) {
4532 char buffer[2+29+1+1];
4533 char *cp;
4534 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004535 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4537 goto onError;
4538 for (cp = buffer; *cp; ++cp)
4539 *str++ = *cp;
4540 }
4541 p = collend;
4542 break;
4543 default:
4544 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4545 reason, startp, size, &exc,
4546 collstart-startp, collend-startp, &newpos);
4547 if (repunicode == NULL)
4548 goto onError;
4549 /* generate replacement */
4550 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004551 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4553 Py_DECREF(repunicode);
4554 goto onError;
4555 }
4556 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4557 *str++ = *uni2;
4558 p = startp + newpos;
4559 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 }
4561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 /* Resize if we allocated to much */
4564 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004565 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004566 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004567 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 }
4569 Py_XDECREF(exc);
4570 Py_XDECREF(errorHandler);
4571 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 onError:
4574 Py_XDECREF(res);
4575 Py_XDECREF(exc);
4576 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577 return NULL;
4578}
4579
4580PyObject *PyUnicode_Translate(PyObject *str,
4581 PyObject *mapping,
4582 const char *errors)
4583{
4584 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004585
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586 str = PyUnicode_FromObject(str);
4587 if (str == NULL)
4588 goto onError;
4589 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4590 PyUnicode_GET_SIZE(str),
4591 mapping,
4592 errors);
4593 Py_DECREF(str);
4594 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004595
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 onError:
4597 Py_XDECREF(str);
4598 return NULL;
4599}
Tim Petersced69f82003-09-16 20:30:58 +00004600
Guido van Rossum9e896b32000-04-05 20:11:21 +00004601/* --- Decimal Encoder ---------------------------------------------------- */
4602
4603int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004604 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004605 char *output,
4606 const char *errors)
4607{
4608 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 PyObject *errorHandler = NULL;
4610 PyObject *exc = NULL;
4611 const char *encoding = "decimal";
4612 const char *reason = "invalid decimal Unicode string";
4613 /* the following variable is used for caching string comparisons
4614 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4615 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004616
4617 if (output == NULL) {
4618 PyErr_BadArgument();
4619 return -1;
4620 }
4621
4622 p = s;
4623 end = s + length;
4624 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004626 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004628 Py_ssize_t repsize;
4629 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004630 Py_UNICODE *uni2;
4631 Py_UNICODE *collstart;
4632 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004633
Guido van Rossum9e896b32000-04-05 20:11:21 +00004634 if (Py_UNICODE_ISSPACE(ch)) {
4635 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004637 continue;
4638 }
4639 decimal = Py_UNICODE_TODECIMAL(ch);
4640 if (decimal >= 0) {
4641 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004643 continue;
4644 }
Guido van Rossumba477042000-04-06 18:18:10 +00004645 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004646 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004648 continue;
4649 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 /* All other characters are considered unencodable */
4651 collstart = p;
4652 collend = p+1;
4653 while (collend < end) {
4654 if ((0 < *collend && *collend < 256) ||
4655 !Py_UNICODE_ISSPACE(*collend) ||
4656 Py_UNICODE_TODECIMAL(*collend))
4657 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004658 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 /* cache callback name lookup
4660 * (if not done yet, i.e. it's the first error) */
4661 if (known_errorHandler==-1) {
4662 if ((errors==NULL) || (!strcmp(errors, "strict")))
4663 known_errorHandler = 1;
4664 else if (!strcmp(errors, "replace"))
4665 known_errorHandler = 2;
4666 else if (!strcmp(errors, "ignore"))
4667 known_errorHandler = 3;
4668 else if (!strcmp(errors, "xmlcharrefreplace"))
4669 known_errorHandler = 4;
4670 else
4671 known_errorHandler = 0;
4672 }
4673 switch (known_errorHandler) {
4674 case 1: /* strict */
4675 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4676 goto onError;
4677 case 2: /* replace */
4678 for (p = collstart; p < collend; ++p)
4679 *output++ = '?';
4680 /* fall through */
4681 case 3: /* ignore */
4682 p = collend;
4683 break;
4684 case 4: /* xmlcharrefreplace */
4685 /* generate replacement (temporarily (mis)uses p) */
4686 for (p = collstart; p < collend; ++p)
4687 output += sprintf(output, "&#%d;", (int)*p);
4688 p = collend;
4689 break;
4690 default:
4691 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4692 encoding, reason, s, length, &exc,
4693 collstart-s, collend-s, &newpos);
4694 if (repunicode == NULL)
4695 goto onError;
4696 /* generate replacement */
4697 repsize = PyUnicode_GET_SIZE(repunicode);
4698 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4699 Py_UNICODE ch = *uni2;
4700 if (Py_UNICODE_ISSPACE(ch))
4701 *output++ = ' ';
4702 else {
4703 decimal = Py_UNICODE_TODECIMAL(ch);
4704 if (decimal >= 0)
4705 *output++ = '0' + decimal;
4706 else if (0 < ch && ch < 256)
4707 *output++ = (char)ch;
4708 else {
4709 Py_DECREF(repunicode);
4710 raise_encode_exception(&exc, encoding,
4711 s, length, collstart-s, collend-s, reason);
4712 goto onError;
4713 }
4714 }
4715 }
4716 p = s + newpos;
4717 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004718 }
4719 }
4720 /* 0-terminate the output string */
4721 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 Py_XDECREF(exc);
4723 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004724 return 0;
4725
4726 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727 Py_XDECREF(exc);
4728 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004729 return -1;
4730}
4731
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732/* --- Helpers ------------------------------------------------------------ */
4733
Thomas Wouters477c8d52006-05-27 19:21:47 +00004734#define STRINGLIB_CHAR Py_UNICODE
4735
4736#define STRINGLIB_LEN PyUnicode_GET_SIZE
4737#define STRINGLIB_NEW PyUnicode_FromUnicode
4738#define STRINGLIB_STR PyUnicode_AS_UNICODE
4739
4740Py_LOCAL_INLINE(int)
4741STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004743 if (str[0] != other[0])
4744 return 1;
4745 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746}
4747
Thomas Wouters477c8d52006-05-27 19:21:47 +00004748#define STRINGLIB_EMPTY unicode_empty
4749
4750#include "stringlib/fastsearch.h"
4751
4752#include "stringlib/count.h"
4753#include "stringlib/find.h"
4754#include "stringlib/partition.h"
4755
/* helper macro to fixup start/end slice values.

   Operates on variables named `start` and `end` in the enclosing scope:
   a negative value is taken relative to (obj)->length and then floored
   at 0; `end` is additionally capped at (obj)->length first.

   Wrapped in do { ... } while (0) so the expansion is one statement and
   can be used safely as the body of an unbraced if/else.  Invoke it with
   a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4768
Martin v. Löwis18e16552006-02-15 17:27:45 +00004769Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004770 PyObject *substr,
4771 Py_ssize_t start,
4772 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004774 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004775 PyUnicodeObject* str_obj;
4776 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004777
Thomas Wouters477c8d52006-05-27 19:21:47 +00004778 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4779 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004781 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4782 if (!sub_obj) {
4783 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 return -1;
4785 }
Tim Petersced69f82003-09-16 20:30:58 +00004786
Thomas Wouters477c8d52006-05-27 19:21:47 +00004787 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004788
Thomas Wouters477c8d52006-05-27 19:21:47 +00004789 result = stringlib_count(
4790 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4791 );
4792
4793 Py_DECREF(sub_obj);
4794 Py_DECREF(str_obj);
4795
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796 return result;
4797}
4798
Martin v. Löwis18e16552006-02-15 17:27:45 +00004799Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004800 PyObject *sub,
4801 Py_ssize_t start,
4802 Py_ssize_t end,
4803 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004805 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004806
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004808 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004809 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004810 sub = PyUnicode_FromObject(sub);
4811 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004812 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004813 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 }
Tim Petersced69f82003-09-16 20:30:58 +00004815
Thomas Wouters477c8d52006-05-27 19:21:47 +00004816 if (direction > 0)
4817 result = stringlib_find_slice(
4818 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4819 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4820 start, end
4821 );
4822 else
4823 result = stringlib_rfind_slice(
4824 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4825 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4826 start, end
4827 );
4828
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004830 Py_DECREF(sub);
4831
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 return result;
4833}
4834
Tim Petersced69f82003-09-16 20:30:58 +00004835static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836int tailmatch(PyUnicodeObject *self,
4837 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004838 Py_ssize_t start,
4839 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 int direction)
4841{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 if (substring->length == 0)
4843 return 1;
4844
Thomas Wouters477c8d52006-05-27 19:21:47 +00004845 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846
4847 end -= substring->length;
4848 if (end < start)
4849 return 0;
4850
4851 if (direction > 0) {
4852 if (Py_UNICODE_MATCH(self, end, substring))
4853 return 1;
4854 } else {
4855 if (Py_UNICODE_MATCH(self, start, substring))
4856 return 1;
4857 }
4858
4859 return 0;
4860}
4861
Martin v. Löwis18e16552006-02-15 17:27:45 +00004862Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004864 Py_ssize_t start,
4865 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 int direction)
4867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004868 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004869
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 str = PyUnicode_FromObject(str);
4871 if (str == NULL)
4872 return -1;
4873 substr = PyUnicode_FromObject(substr);
4874 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004875 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 return -1;
4877 }
Tim Petersced69f82003-09-16 20:30:58 +00004878
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 result = tailmatch((PyUnicodeObject *)str,
4880 (PyUnicodeObject *)substr,
4881 start, end, direction);
4882 Py_DECREF(str);
4883 Py_DECREF(substr);
4884 return result;
4885}
4886
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887/* Apply fixfct filter to the Unicode object self and return a
4888 reference to the modified object */
4889
Tim Petersced69f82003-09-16 20:30:58 +00004890static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891PyObject *fixup(PyUnicodeObject *self,
4892 int (*fixfct)(PyUnicodeObject *s))
4893{
4894
4895 PyUnicodeObject *u;
4896
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004897 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898 if (u == NULL)
4899 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004900
4901 Py_UNICODE_COPY(u->str, self->str, self->length);
4902
Tim Peters7a29bd52001-09-12 03:03:31 +00004903 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 /* fixfct should return TRUE if it modified the buffer. If
4905 FALSE, return a reference to the original buffer instead
4906 (to save space, not time) */
4907 Py_INCREF(self);
4908 Py_DECREF(u);
4909 return (PyObject*) self;
4910 }
4911 return (PyObject*) u;
4912}
4913
Tim Petersced69f82003-09-16 20:30:58 +00004914static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915int fixupper(PyUnicodeObject *self)
4916{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004917 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 Py_UNICODE *s = self->str;
4919 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004920
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 while (len-- > 0) {
4922 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004923
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 ch = Py_UNICODE_TOUPPER(*s);
4925 if (ch != *s) {
4926 status = 1;
4927 *s = ch;
4928 }
4929 s++;
4930 }
4931
4932 return status;
4933}
4934
Tim Petersced69f82003-09-16 20:30:58 +00004935static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936int fixlower(PyUnicodeObject *self)
4937{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004938 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 Py_UNICODE *s = self->str;
4940 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004941
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942 while (len-- > 0) {
4943 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004944
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 ch = Py_UNICODE_TOLOWER(*s);
4946 if (ch != *s) {
4947 status = 1;
4948 *s = ch;
4949 }
4950 s++;
4951 }
4952
4953 return status;
4954}
4955
Tim Petersced69f82003-09-16 20:30:58 +00004956static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957int fixswapcase(PyUnicodeObject *self)
4958{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004959 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 Py_UNICODE *s = self->str;
4961 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004962
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 while (len-- > 0) {
4964 if (Py_UNICODE_ISUPPER(*s)) {
4965 *s = Py_UNICODE_TOLOWER(*s);
4966 status = 1;
4967 } else if (Py_UNICODE_ISLOWER(*s)) {
4968 *s = Py_UNICODE_TOUPPER(*s);
4969 status = 1;
4970 }
4971 s++;
4972 }
4973
4974 return status;
4975}
4976
Tim Petersced69f82003-09-16 20:30:58 +00004977static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978int fixcapitalize(PyUnicodeObject *self)
4979{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004980 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004981 Py_UNICODE *s = self->str;
4982 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004983
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004984 if (len == 0)
4985 return 0;
4986 if (Py_UNICODE_ISLOWER(*s)) {
4987 *s = Py_UNICODE_TOUPPER(*s);
4988 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004990 s++;
4991 while (--len > 0) {
4992 if (Py_UNICODE_ISUPPER(*s)) {
4993 *s = Py_UNICODE_TOLOWER(*s);
4994 status = 1;
4995 }
4996 s++;
4997 }
4998 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999}
5000
5001static
5002int fixtitle(PyUnicodeObject *self)
5003{
5004 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5005 register Py_UNICODE *e;
5006 int previous_is_cased;
5007
5008 /* Shortcut for single character strings */
5009 if (PyUnicode_GET_SIZE(self) == 1) {
5010 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5011 if (*p != ch) {
5012 *p = ch;
5013 return 1;
5014 }
5015 else
5016 return 0;
5017 }
Tim Petersced69f82003-09-16 20:30:58 +00005018
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019 e = p + PyUnicode_GET_SIZE(self);
5020 previous_is_cased = 0;
5021 for (; p < e; p++) {
5022 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005023
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024 if (previous_is_cased)
5025 *p = Py_UNICODE_TOLOWER(ch);
5026 else
5027 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005028
5029 if (Py_UNICODE_ISLOWER(ch) ||
5030 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031 Py_UNICODE_ISTITLE(ch))
5032 previous_is_cased = 1;
5033 else
5034 previous_is_cased = 0;
5035 }
5036 return 1;
5037}
5038
Tim Peters8ce9f162004-08-27 01:49:32 +00005039PyObject *
5040PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041{
Tim Peters8ce9f162004-08-27 01:49:32 +00005042 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005043 const Py_UNICODE blank = ' ';
5044 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005045 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005046 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005047 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5048 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005049 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5050 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005051 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005052 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005053 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054
Tim Peters05eba1f2004-08-27 21:32:02 +00005055 fseq = PySequence_Fast(seq, "");
5056 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005057 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005058 }
5059
Tim Peters91879ab2004-08-27 22:35:44 +00005060 /* Grrrr. A codec may be invoked to convert str objects to
5061 * Unicode, and so it's possible to call back into Python code
5062 * during PyUnicode_FromObject(), and so it's possible for a sick
5063 * codec to change the size of fseq (if seq is a list). Therefore
5064 * we have to keep refetching the size -- can't assume seqlen
5065 * is invariant.
5066 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005067 seqlen = PySequence_Fast_GET_SIZE(fseq);
5068 /* If empty sequence, return u"". */
5069 if (seqlen == 0) {
5070 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5071 goto Done;
5072 }
5073 /* If singleton sequence with an exact Unicode, return that. */
5074 if (seqlen == 1) {
5075 item = PySequence_Fast_GET_ITEM(fseq, 0);
5076 if (PyUnicode_CheckExact(item)) {
5077 Py_INCREF(item);
5078 res = (PyUnicodeObject *)item;
5079 goto Done;
5080 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005081 }
5082
Tim Peters05eba1f2004-08-27 21:32:02 +00005083 /* At least two items to join, or one that isn't exact Unicode. */
5084 if (seqlen > 1) {
5085 /* Set up sep and seplen -- they're needed. */
5086 if (separator == NULL) {
5087 sep = &blank;
5088 seplen = 1;
5089 }
5090 else {
5091 internal_separator = PyUnicode_FromObject(separator);
5092 if (internal_separator == NULL)
5093 goto onError;
5094 sep = PyUnicode_AS_UNICODE(internal_separator);
5095 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005096 /* In case PyUnicode_FromObject() mutated seq. */
5097 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005098 }
5099 }
5100
5101 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005102 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005103 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005104 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005105 res_p = PyUnicode_AS_UNICODE(res);
5106 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005107
Tim Peters05eba1f2004-08-27 21:32:02 +00005108 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005109 Py_ssize_t itemlen;
5110 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005111
5112 item = PySequence_Fast_GET_ITEM(fseq, i);
5113 /* Convert item to Unicode. */
5114 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5115 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005116 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005117 " %.80s found",
5118 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005119 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005120 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005121 item = PyUnicode_FromObject(item);
5122 if (item == NULL)
5123 goto onError;
5124 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005125
Tim Peters91879ab2004-08-27 22:35:44 +00005126 /* In case PyUnicode_FromObject() mutated seq. */
5127 seqlen = PySequence_Fast_GET_SIZE(fseq);
5128
Tim Peters8ce9f162004-08-27 01:49:32 +00005129 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005131 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005132 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005133 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005134 if (i < seqlen - 1) {
5135 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005136 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005137 goto Overflow;
5138 }
5139 if (new_res_used > res_alloc) {
5140 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005141 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005142 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005143 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005144 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005145 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005146 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005147 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005149 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005150 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005152
5153 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005154 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005155 res_p += itemlen;
5156 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005157 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005158 res_p += seplen;
5159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005161 res_used = new_res_used;
5162 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005163
Tim Peters05eba1f2004-08-27 21:32:02 +00005164 /* Shrink res to match the used area; this probably can't fail,
5165 * but it's cheap to check.
5166 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005167 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005168 goto onError;
5169
5170 Done:
5171 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005172 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 return (PyObject *)res;
5174
Tim Peters8ce9f162004-08-27 01:49:32 +00005175 Overflow:
5176 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005177 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005178 Py_DECREF(item);
5179 /* fall through */
5180
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005182 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005183 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005184 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 return NULL;
5186}
5187
Tim Petersced69f82003-09-16 20:30:58 +00005188static
5189PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005190 Py_ssize_t left,
5191 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 Py_UNICODE fill)
5193{
5194 PyUnicodeObject *u;
5195
5196 if (left < 0)
5197 left = 0;
5198 if (right < 0)
5199 right = 0;
5200
Tim Peters7a29bd52001-09-12 03:03:31 +00005201 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 Py_INCREF(self);
5203 return self;
5204 }
5205
5206 u = _PyUnicode_New(left + self->length + right);
5207 if (u) {
5208 if (left)
5209 Py_UNICODE_FILL(u->str, fill, left);
5210 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5211 if (right)
5212 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5213 }
5214
5215 return u;
5216}
5217
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, the `list` being built and an `onError` label, which is
   jumped to if the slice cannot be created or appended.

   Wrapped in do { ... } while (0) so the expansion is one statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);                                             \
    } while (0)
5228
5229static
5230PyObject *split_whitespace(PyUnicodeObject *self,
5231 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005234 register Py_ssize_t i;
5235 register Py_ssize_t j;
5236 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 PyObject *str;
5238
5239 for (i = j = 0; i < len; ) {
5240 /* find a token */
5241 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5242 i++;
5243 j = i;
5244 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5245 i++;
5246 if (j < i) {
5247 if (maxcount-- <= 0)
5248 break;
5249 SPLIT_APPEND(self->str, j, i);
5250 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5251 i++;
5252 j = i;
5253 }
5254 }
5255 if (j < len) {
5256 SPLIT_APPEND(self->str, j, len);
5257 }
5258 return list;
5259
5260 onError:
5261 Py_DECREF(list);
5262 return NULL;
5263}
5264
5265PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005266 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005268 register Py_ssize_t i;
5269 register Py_ssize_t j;
5270 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 PyObject *list;
5272 PyObject *str;
5273 Py_UNICODE *data;
5274
5275 string = PyUnicode_FromObject(string);
5276 if (string == NULL)
5277 return NULL;
5278 data = PyUnicode_AS_UNICODE(string);
5279 len = PyUnicode_GET_SIZE(string);
5280
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 list = PyList_New(0);
5282 if (!list)
5283 goto onError;
5284
5285 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005286 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005287
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005289 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291
5292 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005293 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 if (i < len) {
5295 if (data[i] == '\r' && i + 1 < len &&
5296 data[i+1] == '\n')
5297 i += 2;
5298 else
5299 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005300 if (keepends)
5301 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 }
Guido van Rossum86662912000-04-11 15:38:46 +00005303 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 j = i;
5305 }
5306 if (j < len) {
5307 SPLIT_APPEND(data, j, len);
5308 }
5309
5310 Py_DECREF(string);
5311 return list;
5312
5313 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005314 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 Py_DECREF(string);
5316 return NULL;
5317}
5318
Tim Petersced69f82003-09-16 20:30:58 +00005319static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320PyObject *split_char(PyUnicodeObject *self,
5321 PyObject *list,
5322 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005323 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005325 register Py_ssize_t i;
5326 register Py_ssize_t j;
5327 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 PyObject *str;
5329
5330 for (i = j = 0; i < len; ) {
5331 if (self->str[i] == ch) {
5332 if (maxcount-- <= 0)
5333 break;
5334 SPLIT_APPEND(self->str, j, i);
5335 i = j = i + 1;
5336 } else
5337 i++;
5338 }
5339 if (j <= len) {
5340 SPLIT_APPEND(self->str, j, len);
5341 }
5342 return list;
5343
5344 onError:
5345 Py_DECREF(list);
5346 return NULL;
5347}
5348
Tim Petersced69f82003-09-16 20:30:58 +00005349static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350PyObject *split_substring(PyUnicodeObject *self,
5351 PyObject *list,
5352 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005353 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005355 register Py_ssize_t i;
5356 register Py_ssize_t j;
5357 Py_ssize_t len = self->length;
5358 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 PyObject *str;
5360
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005361 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 if (Py_UNICODE_MATCH(self, i, substring)) {
5363 if (maxcount-- <= 0)
5364 break;
5365 SPLIT_APPEND(self->str, j, i);
5366 i = j = i + sublen;
5367 } else
5368 i++;
5369 }
5370 if (j <= len) {
5371 SPLIT_APPEND(self->str, j, len);
5372 }
5373 return list;
5374
5375 onError:
5376 Py_DECREF(list);
5377 return NULL;
5378}
5379
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005380static
5381PyObject *rsplit_whitespace(PyUnicodeObject *self,
5382 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005383 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005384{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005385 register Py_ssize_t i;
5386 register Py_ssize_t j;
5387 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005388 PyObject *str;
5389
5390 for (i = j = len - 1; i >= 0; ) {
5391 /* find a token */
5392 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5393 i--;
5394 j = i;
5395 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5396 i--;
5397 if (j > i) {
5398 if (maxcount-- <= 0)
5399 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005400 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005401 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5402 i--;
5403 j = i;
5404 }
5405 }
5406 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005407 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005408 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005409 if (PyList_Reverse(list) < 0)
5410 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005411 return list;
5412
5413 onError:
5414 Py_DECREF(list);
5415 return NULL;
5416}
5417
5418static
5419PyObject *rsplit_char(PyUnicodeObject *self,
5420 PyObject *list,
5421 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005422 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005423{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005424 register Py_ssize_t i;
5425 register Py_ssize_t j;
5426 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005427 PyObject *str;
5428
5429 for (i = j = len - 1; i >= 0; ) {
5430 if (self->str[i] == ch) {
5431 if (maxcount-- <= 0)
5432 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005433 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005434 j = i = i - 1;
5435 } else
5436 i--;
5437 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005438 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005439 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005440 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005441 if (PyList_Reverse(list) < 0)
5442 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005443 return list;
5444
5445 onError:
5446 Py_DECREF(list);
5447 return NULL;
5448}
5449
5450static
5451PyObject *rsplit_substring(PyUnicodeObject *self,
5452 PyObject *list,
5453 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005454 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005455{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005456 register Py_ssize_t i;
5457 register Py_ssize_t j;
5458 Py_ssize_t len = self->length;
5459 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005460 PyObject *str;
5461
5462 for (i = len - sublen, j = len; i >= 0; ) {
5463 if (Py_UNICODE_MATCH(self, i, substring)) {
5464 if (maxcount-- <= 0)
5465 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005466 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005467 j = i;
5468 i -= sublen;
5469 } else
5470 i--;
5471 }
5472 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005473 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005474 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005475 if (PyList_Reverse(list) < 0)
5476 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005477 return list;
5478
5479 onError:
5480 Py_DECREF(list);
5481 return NULL;
5482}
5483
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484#undef SPLIT_APPEND
5485
5486static
5487PyObject *split(PyUnicodeObject *self,
5488 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005489 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490{
5491 PyObject *list;
5492
5493 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005494 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495
5496 list = PyList_New(0);
5497 if (!list)
5498 return NULL;
5499
5500 if (substring == NULL)
5501 return split_whitespace(self,list,maxcount);
5502
5503 else if (substring->length == 1)
5504 return split_char(self,list,substring->str[0],maxcount);
5505
5506 else if (substring->length == 0) {
5507 Py_DECREF(list);
5508 PyErr_SetString(PyExc_ValueError, "empty separator");
5509 return NULL;
5510 }
5511 else
5512 return split_substring(self,list,substring,maxcount);
5513}
5514
Tim Petersced69f82003-09-16 20:30:58 +00005515static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005516PyObject *rsplit(PyUnicodeObject *self,
5517 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005518 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005519{
5520 PyObject *list;
5521
5522 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005523 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005524
5525 list = PyList_New(0);
5526 if (!list)
5527 return NULL;
5528
5529 if (substring == NULL)
5530 return rsplit_whitespace(self,list,maxcount);
5531
5532 else if (substring->length == 1)
5533 return rsplit_char(self,list,substring->str[0],maxcount);
5534
5535 else if (substring->length == 0) {
5536 Py_DECREF(list);
5537 PyErr_SetString(PyExc_ValueError, "empty separator");
5538 return NULL;
5539 }
5540 else
5541 return rsplit_substring(self,list,substring,maxcount);
5542}
5543
5544static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545PyObject *replace(PyUnicodeObject *self,
5546 PyUnicodeObject *str1,
5547 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005548 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549{
5550 PyUnicodeObject *u;
5551
5552 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005553 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554
Thomas Wouters477c8d52006-05-27 19:21:47 +00005555 if (str1->length == str2->length) {
5556 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005557 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005558 if (str1->length == 1) {
5559 /* replace characters */
5560 Py_UNICODE u1, u2;
5561 if (!findchar(self->str, self->length, str1->str[0]))
5562 goto nothing;
5563 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5564 if (!u)
5565 return NULL;
5566 Py_UNICODE_COPY(u->str, self->str, self->length);
5567 u1 = str1->str[0];
5568 u2 = str2->str[0];
5569 for (i = 0; i < u->length; i++)
5570 if (u->str[i] == u1) {
5571 if (--maxcount < 0)
5572 break;
5573 u->str[i] = u2;
5574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005576 i = fastsearch(
5577 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005579 if (i < 0)
5580 goto nothing;
5581 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5582 if (!u)
5583 return NULL;
5584 Py_UNICODE_COPY(u->str, self->str, self->length);
5585 while (i <= self->length - str1->length)
5586 if (Py_UNICODE_MATCH(self, i, str1)) {
5587 if (--maxcount < 0)
5588 break;
5589 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5590 i += str1->length;
5591 } else
5592 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005595
5596 Py_ssize_t n, i, j, e;
5597 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 Py_UNICODE *p;
5599
5600 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005601 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 if (n > maxcount)
5603 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005604 if (n == 0)
5605 goto nothing;
5606 /* new_size = self->length + n * (str2->length - str1->length)); */
5607 delta = (str2->length - str1->length);
5608 if (delta == 0) {
5609 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005611 product = n * (str2->length - str1->length);
5612 if ((product / (str2->length - str1->length)) != n) {
5613 PyErr_SetString(PyExc_OverflowError,
5614 "replace string is too long");
5615 return NULL;
5616 }
5617 new_size = self->length + product;
5618 if (new_size < 0) {
5619 PyErr_SetString(PyExc_OverflowError,
5620 "replace string is too long");
5621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 }
5623 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005624 u = _PyUnicode_New(new_size);
5625 if (!u)
5626 return NULL;
5627 i = 0;
5628 p = u->str;
5629 e = self->length - str1->length;
5630 if (str1->length > 0) {
5631 while (n-- > 0) {
5632 /* look for next match */
5633 j = i;
5634 while (j <= e) {
5635 if (Py_UNICODE_MATCH(self, j, str1))
5636 break;
5637 j++;
5638 }
5639 if (j > i) {
5640 if (j > e)
5641 break;
5642 /* copy unchanged part [i:j] */
5643 Py_UNICODE_COPY(p, self->str+i, j-i);
5644 p += j - i;
5645 }
5646 /* copy substitution string */
5647 if (str2->length > 0) {
5648 Py_UNICODE_COPY(p, str2->str, str2->length);
5649 p += str2->length;
5650 }
5651 i = j + str1->length;
5652 }
5653 if (i < self->length)
5654 /* copy tail [i:] */
5655 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5656 } else {
5657 /* interleave */
5658 while (n > 0) {
5659 Py_UNICODE_COPY(p, str2->str, str2->length);
5660 p += str2->length;
5661 if (--n <= 0)
5662 break;
5663 *p++ = self->str[i++];
5664 }
5665 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005669
5670nothing:
5671 /* nothing to replace; return original string (when possible) */
5672 if (PyUnicode_CheckExact(self)) {
5673 Py_INCREF(self);
5674 return (PyObject *) self;
5675 }
5676 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677}
5678
5679/* --- Unicode Object Methods --------------------------------------------- */
5680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005681PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682"S.title() -> unicode\n\
5683\n\
5684Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005685characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686
5687static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005688unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 return fixup(self, fixtitle);
5691}
5692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005693PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694"S.capitalize() -> unicode\n\
5695\n\
5696Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005697have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698
5699static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005700unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 return fixup(self, fixcapitalize);
5703}
5704
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self on whitespace, runs fixup(..., fixcapitalize) on every
   word, and joins the words again with PyUnicode_Join(NULL, ...).
   Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5742
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005743/* Argument converter. Coerces to a single unicode character */
5744
5745static int
5746convert_uc(PyObject *obj, void *addr)
5747{
5748 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5749 PyObject *uniobj;
5750 Py_UNICODE *unistr;
5751
5752 uniobj = PyUnicode_FromObject(obj);
5753 if (uniobj == NULL) {
5754 PyErr_SetString(PyExc_TypeError,
5755 "The fill character cannot be converted to Unicode");
5756 return 0;
5757 }
5758 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5759 PyErr_SetString(PyExc_TypeError,
5760 "The fill character must be exactly one character long");
5761 Py_DECREF(uniobj);
5762 return 0;
5763 }
5764 unistr = PyUnicode_AS_UNICODE(uniobj);
5765 *fillcharloc = unistr[0];
5766 Py_DECREF(uniobj);
5767 return 1;
5768}
5769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005770PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005771"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005773Return S centered in a Unicode string of length width. Padding is\n\
5774done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775
5776static PyObject *
5777unicode_center(PyUnicodeObject *self, PyObject *args)
5778{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005779 Py_ssize_t marg, left;
5780 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005781 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782
Thomas Woutersde017742006-02-16 19:34:37 +00005783 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 return NULL;
5785
Tim Peters7a29bd52001-09-12 03:03:31 +00005786 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 Py_INCREF(self);
5788 return (PyObject*) self;
5789 }
5790
5791 marg = width - self->length;
5792 left = marg / 2 + (marg & width & 1);
5793
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005794 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795}
5796
Marc-André Lemburge5034372000-08-08 08:04:29 +00005797#if 0
5798
5799/* This code should go into some future Unicode collation support
5800 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005801 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005802
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005803/* speedy UTF-16 code point order comparison */
5804/* gleaned from: */
5805/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5806
/* Per-block adjustment, indexed by (code unit >> 11), i.e. one entry per
   0x800-wide block of the 16-bit range.  Block 27 (0xD800-0xDFFF) is
   shifted up by 0x2000 and blocks 28-31 (0xE000-0xFFFF) are shifted down
   by 0x800; every other block is left alone. */
static short utf16Fixup[32] =
{
    /*  0- 3 */ 0, 0, 0, 0,
    /*  4- 7 */ 0, 0, 0, 0,
    /*  8-11 */ 0, 0, 0, 0,
    /* 12-15 */ 0, 0, 0, 0,
    /* 16-19 */ 0, 0, 0, 0,
    /* 20-23 */ 0, 0, 0, 0,
    /* 24-27 */ 0, 0, 0, 0x2000,
    /* 28-31 */ -0x800, -0x800, -0x800, -0x800
};
5814
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815static int
5816unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5817{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005818 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005819
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820 Py_UNICODE *s1 = str1->str;
5821 Py_UNICODE *s2 = str2->str;
5822
5823 len1 = str1->length;
5824 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005825
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005827 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005828
5829 c1 = *s1++;
5830 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005831
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005832 if (c1 > (1<<11) * 26)
5833 c1 += utf16Fixup[c1>>11];
5834 if (c2 > (1<<11) * 26)
5835 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005836 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005837
5838 if (c1 != c2)
5839 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005840
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005841 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 }
5843
5844 return (len1 < len2) ? -1 : (len1 != len2);
5845}
5846
Marc-André Lemburge5034372000-08-08 08:04:29 +00005847#else
5848
5849static int
5850unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5851{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005852 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005853
5854 Py_UNICODE *s1 = str1->str;
5855 Py_UNICODE *s2 = str2->str;
5856
5857 len1 = str1->length;
5858 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005859
Marc-André Lemburge5034372000-08-08 08:04:29 +00005860 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005861 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005862
Fredrik Lundh45714e92001-06-26 16:39:36 +00005863 c1 = *s1++;
5864 c2 = *s2++;
5865
5866 if (c1 != c2)
5867 return (c1 < c2) ? -1 : 1;
5868
Marc-André Lemburge5034372000-08-08 08:04:29 +00005869 len1--; len2--;
5870 }
5871
5872 return (len1 < len2) ? -1 : (len1 != len2);
5873}
5874
5875#endif
5876
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877int PyUnicode_Compare(PyObject *left,
5878 PyObject *right)
5879{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005880 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5881 return unicode_compare((PyUnicodeObject *)left,
5882 (PyUnicodeObject *)right);
5883 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5884 (PyUnicode_Check(left) && PyString_Check(right))) {
5885 if (PyUnicode_Check(left))
5886 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5887 if (PyUnicode_Check(right))
5888 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5889 assert(PyString_Check(left));
5890 assert(PyString_Check(right));
5891 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005893 PyErr_Format(PyExc_TypeError,
5894 "Can't compare %.100s and %.100s",
5895 left->ob_type->tp_name,
5896 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 return -1;
5898}
5899
Martin v. Löwis5b222132007-06-10 09:51:05 +00005900int
5901PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5902{
5903 int i;
5904 Py_UNICODE *id;
5905 assert(PyUnicode_Check(uni));
5906 id = PyUnicode_AS_UNICODE(uni);
5907 /* Compare Unicode string and source character set string */
5908 for (i = 0; id[i] && str[i]; i++)
5909 if (id[i] != str[i])
5910 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5911 if (id[i])
5912 return 1; /* uni is longer */
5913 if (str[i])
5914 return -1; /* str is longer */
5915 return 0;
5916}
5917
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005918PyObject *PyUnicode_RichCompare(PyObject *left,
5919 PyObject *right,
5920 int op)
5921{
5922 int result;
5923
5924 result = PyUnicode_Compare(left, right);
5925 if (result == -1 && PyErr_Occurred())
5926 goto onError;
5927
5928 /* Convert the return value to a Boolean */
5929 switch (op) {
5930 case Py_EQ:
5931 result = (result == 0);
5932 break;
5933 case Py_NE:
5934 result = (result != 0);
5935 break;
5936 case Py_LE:
5937 result = (result <= 0);
5938 break;
5939 case Py_GE:
5940 result = (result >= 0);
5941 break;
5942 case Py_LT:
5943 result = (result == -1);
5944 break;
5945 case Py_GT:
5946 result = (result == 1);
5947 break;
5948 }
5949 return PyBool_FromLong(result);
5950
5951 onError:
5952
5953 /* Standard case
5954
5955 Type errors mean that PyUnicode_FromObject() could not convert
5956 one of the arguments (usually the right hand side) to Unicode,
5957 ie. we can't handle the comparison request. However, it is
5958 possible that the other object knows a comparison method, which
5959 is why we return Py_NotImplemented to give the other object a
5960 chance.
5961
5962 */
5963 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5964 PyErr_Clear();
5965 Py_INCREF(Py_NotImplemented);
5966 return Py_NotImplemented;
5967 }
5968 if (op != Py_EQ && op != Py_NE)
5969 return NULL;
5970
5971 /* Equality comparison.
5972
5973 This is a special case: we silence any PyExc_UnicodeDecodeError
5974 and instead turn it into a PyErr_UnicodeWarning.
5975
5976 */
5977 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5978 return NULL;
5979 PyErr_Clear();
5980 if (PyErr_Warn(PyExc_UnicodeWarning,
5981 (op == Py_EQ) ?
5982 "Unicode equal comparison "
5983 "failed to convert both arguments to Unicode - "
5984 "interpreting them as being unequal" :
5985 "Unicode unequal comparison "
5986 "failed to convert both arguments to Unicode - "
5987 "interpreting them as being unequal"
5988 ) < 0)
5989 return NULL;
5990 result = (op == Py_NE);
5991 return PyBool_FromLong(result);
5992}
5993
Guido van Rossum403d68b2000-03-13 15:55:09 +00005994int PyUnicode_Contains(PyObject *container,
5995 PyObject *element)
5996{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005997 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005998 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005999
6000 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006001 sub = PyUnicode_FromObject(element);
6002 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006003 PyErr_Format(PyExc_TypeError,
6004 "'in <string>' requires string as left operand, not %s",
6005 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006006 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006007 }
6008
Thomas Wouters477c8d52006-05-27 19:21:47 +00006009 str = PyUnicode_FromObject(container);
6010 if (!str) {
6011 Py_DECREF(sub);
6012 return -1;
6013 }
6014
6015 result = stringlib_contains_obj(str, sub);
6016
6017 Py_DECREF(str);
6018 Py_DECREF(sub);
6019
Guido van Rossum403d68b2000-03-13 15:55:09 +00006020 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006021}
6022
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023/* Concat to string or Unicode object giving a new Unicode object. */
6024
6025PyObject *PyUnicode_Concat(PyObject *left,
6026 PyObject *right)
6027{
6028 PyUnicodeObject *u = NULL, *v = NULL, *w;
6029
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006030 if (PyBytes_Check(left) || PyBytes_Check(right))
6031 return PyBytes_Concat(left, right);
6032
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 /* Coerce the two arguments */
6034 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6035 if (u == NULL)
6036 goto onError;
6037 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6038 if (v == NULL)
6039 goto onError;
6040
6041 /* Shortcuts */
6042 if (v == unicode_empty) {
6043 Py_DECREF(v);
6044 return (PyObject *)u;
6045 }
6046 if (u == unicode_empty) {
6047 Py_DECREF(u);
6048 return (PyObject *)v;
6049 }
6050
6051 /* Concat the two Unicode strings */
6052 w = _PyUnicode_New(u->length + v->length);
6053 if (w == NULL)
6054 goto onError;
6055 Py_UNICODE_COPY(w->str, u->str, u->length);
6056 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6057
6058 Py_DECREF(u);
6059 Py_DECREF(v);
6060 return (PyObject *)w;
6061
6062onError:
6063 Py_XDECREF(u);
6064 Py_XDECREF(v);
6065 return NULL;
6066}
6067
Walter Dörwald1ab83302007-05-18 17:15:44 +00006068void
6069PyUnicode_Append(PyObject **pleft, PyObject *right)
6070{
6071 PyObject *new;
6072 if (*pleft == NULL)
6073 return;
6074 if (right == NULL || !PyUnicode_Check(*pleft)) {
6075 Py_DECREF(*pleft);
6076 *pleft = NULL;
6077 return;
6078 }
6079 new = PyUnicode_Concat(*pleft, right);
6080 Py_DECREF(*pleft);
6081 *pleft = new;
6082}
6083
6084void
6085PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6086{
6087 PyUnicode_Append(pleft, right);
6088 Py_XDECREF(right);
6089}
6090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006091PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092"S.count(sub[, start[, end]]) -> int\n\
6093\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006094Return the number of non-overlapping occurrences of substring sub in\n\
6095Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006096interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097
6098static PyObject *
6099unicode_count(PyUnicodeObject *self, PyObject *args)
6100{
6101 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006102 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006103 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 PyObject *result;
6105
Guido van Rossumb8872e62000-05-09 14:14:27 +00006106 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6107 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 return NULL;
6109
6110 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006111 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 if (substring == NULL)
6113 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006114
Thomas Wouters477c8d52006-05-27 19:21:47 +00006115 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
Thomas Wouters477c8d52006-05-27 19:21:47 +00006117 result = PyInt_FromSsize_t(
6118 stringlib_count(self->str + start, end - start,
6119 substring->str, substring->length)
6120 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121
6122 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006123
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 return result;
6125}
6126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006127PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006128"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006130Encodes S using the codec registered for encoding. encoding defaults\n\
6131to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006132handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6134'xmlcharrefreplace' as well as any other name registered with\n\
6135codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136
6137static PyObject *
6138unicode_encode(PyUnicodeObject *self, PyObject *args)
6139{
6140 char *encoding = NULL;
6141 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006142 PyObject *v;
6143
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6145 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006146 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006147 if (v == NULL)
6148 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006149 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006150 if (PyString_Check(v)) {
6151 /* Old codec, turn it into bytes */
6152 PyObject *b = PyBytes_FromObject(v);
6153 Py_DECREF(v);
6154 return b;
6155 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006156 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006157 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006158 "(type=%.400s)",
6159 v->ob_type->tp_name);
6160 Py_DECREF(v);
6161 return NULL;
6162 }
6163 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006164
6165 onError:
6166 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006167}
6168
6169PyDoc_STRVAR(decode__doc__,
6170"S.decode([encoding[,errors]]) -> string or unicode\n\
6171\n\
6172Decodes S using the codec registered for encoding. encoding defaults\n\
6173to the default encoding. errors may be given to set a different error\n\
6174handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6175a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6176as well as any other name registerd with codecs.register_error that is\n\
6177able to handle UnicodeDecodeErrors.");
6178
6179static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006180unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006181{
6182 char *encoding = NULL;
6183 char *errors = NULL;
6184 PyObject *v;
6185
6186 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6187 return NULL;
6188 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006189 if (v == NULL)
6190 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006191 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6192 PyErr_Format(PyExc_TypeError,
6193 "decoder did not return a string/unicode object "
6194 "(type=%.400s)",
6195 v->ob_type->tp_name);
6196 Py_DECREF(v);
6197 return NULL;
6198 }
6199 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006200
6201 onError:
6202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203}
6204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006205PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206"S.expandtabs([tabsize]) -> unicode\n\
6207\n\
6208Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006209If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210
6211static PyObject*
6212unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6213{
6214 Py_UNICODE *e;
6215 Py_UNICODE *p;
6216 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006217 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 PyUnicodeObject *u;
6219 int tabsize = 8;
6220
6221 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6222 return NULL;
6223
Thomas Wouters7e474022000-07-16 12:04:32 +00006224 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006225 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 e = self->str + self->length;
6227 for (p = self->str; p < e; p++)
6228 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006229 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006231 if (old_j > j) {
6232 PyErr_SetString(PyExc_OverflowError,
6233 "new string is too long");
6234 return NULL;
6235 }
6236 old_j = j;
6237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 }
6239 else {
6240 j++;
6241 if (*p == '\n' || *p == '\r') {
6242 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006243 old_j = j = 0;
6244 if (i < 0) {
6245 PyErr_SetString(PyExc_OverflowError,
6246 "new string is too long");
6247 return NULL;
6248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 }
6250 }
6251
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006252 if ((i + j) < 0) {
6253 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6254 return NULL;
6255 }
6256
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 /* Second pass: create output string and fill it */
6258 u = _PyUnicode_New(i + j);
6259 if (!u)
6260 return NULL;
6261
6262 j = 0;
6263 q = u->str;
6264
6265 for (p = self->str; p < e; p++)
6266 if (*p == '\t') {
6267 if (tabsize > 0) {
6268 i = tabsize - (j % tabsize);
6269 j += i;
6270 while (i--)
6271 *q++ = ' ';
6272 }
6273 }
6274 else {
6275 j++;
6276 *q++ = *p;
6277 if (*p == '\n' || *p == '\r')
6278 j = 0;
6279 }
6280
6281 return (PyObject*) u;
6282}
6283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006284PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285"S.find(sub [,start [,end]]) -> int\n\
6286\n\
6287Return the lowest index in S where substring sub is found,\n\
6288such that sub is contained within s[start,end]. Optional\n\
6289arguments start and end are interpreted as in slice notation.\n\
6290\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006291Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292
6293static PyObject *
6294unicode_find(PyUnicodeObject *self, PyObject *args)
6295{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006296 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006297 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006298 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006299 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300
Guido van Rossumb8872e62000-05-09 14:14:27 +00006301 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6302 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006304 substring = PyUnicode_FromObject(substring);
6305 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 return NULL;
6307
Thomas Wouters477c8d52006-05-27 19:21:47 +00006308 result = stringlib_find_slice(
6309 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6310 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6311 start, end
6312 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313
6314 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006315
6316 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317}
6318
6319static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006320unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321{
6322 if (index < 0 || index >= self->length) {
6323 PyErr_SetString(PyExc_IndexError, "string index out of range");
6324 return NULL;
6325 }
6326
6327 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6328}
6329
6330static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006331unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006333 /* Since Unicode objects compare equal to their UTF-8 string
6334 counterparts, we hash the UTF-8 string. */
6335 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6336 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337}
6338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006339PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340"S.index(sub [,start [,end]]) -> int\n\
6341\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006342Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
6344static PyObject *
6345unicode_index(PyUnicodeObject *self, PyObject *args)
6346{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006347 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006348 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006349 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006350 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351
Guido van Rossumb8872e62000-05-09 14:14:27 +00006352 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6353 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006355 substring = PyUnicode_FromObject(substring);
6356 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 return NULL;
6358
Thomas Wouters477c8d52006-05-27 19:21:47 +00006359 result = stringlib_find_slice(
6360 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6361 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6362 start, end
6363 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364
6365 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006366
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 if (result < 0) {
6368 PyErr_SetString(PyExc_ValueError, "substring not found");
6369 return NULL;
6370 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006371
Martin v. Löwis18e16552006-02-15 17:27:45 +00006372 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373}
6374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006375PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006376"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006378Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006379at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380
6381static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006382unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383{
6384 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6385 register const Py_UNICODE *e;
6386 int cased;
6387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 /* Shortcut for single character strings */
6389 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006390 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006392 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006393 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006394 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006395
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396 e = p + PyUnicode_GET_SIZE(self);
6397 cased = 0;
6398 for (; p < e; p++) {
6399 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006400
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006402 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 else if (!cased && Py_UNICODE_ISLOWER(ch))
6404 cased = 1;
6405 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006406 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407}
6408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006409PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006410"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006412Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006413at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
6415static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006416unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417{
6418 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6419 register const Py_UNICODE *e;
6420 int cased;
6421
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 /* Shortcut for single character strings */
6423 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006424 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006426 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006427 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006428 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006429
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 e = p + PyUnicode_GET_SIZE(self);
6431 cased = 0;
6432 for (; p < e; p++) {
6433 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006434
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006436 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 else if (!cased && Py_UNICODE_ISUPPER(ch))
6438 cased = 1;
6439 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006440 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441}
6442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006443PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006444"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006446Return True if S is a titlecased string and there is at least one\n\
6447character in S, i.e. upper- and titlecase characters may only\n\
6448follow uncased characters and lowercase characters only cased ones.\n\
6449Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450
6451static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006452unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453{
6454 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6455 register const Py_UNICODE *e;
6456 int cased, previous_is_cased;
6457
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 /* Shortcut for single character strings */
6459 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006460 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6461 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006463 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006464 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006465 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006466
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 e = p + PyUnicode_GET_SIZE(self);
6468 cased = 0;
6469 previous_is_cased = 0;
6470 for (; p < e; p++) {
6471 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006472
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6474 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006475 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 previous_is_cased = 1;
6477 cased = 1;
6478 }
6479 else if (Py_UNICODE_ISLOWER(ch)) {
6480 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006481 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 previous_is_cased = 1;
6483 cased = 1;
6484 }
6485 else
6486 previous_is_cased = 0;
6487 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006488 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489}
6490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006491PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006492"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006494Return True if all characters in S are whitespace\n\
6495and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496
6497static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006498unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499{
6500 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6501 register const Py_UNICODE *e;
6502
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 /* Shortcut for single character strings */
6504 if (PyUnicode_GET_SIZE(self) == 1 &&
6505 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006506 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006508 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006509 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006510 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006511
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 e = p + PyUnicode_GET_SIZE(self);
6513 for (; p < e; p++) {
6514 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006515 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006517 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518}
6519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006520PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006521"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006522\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006523Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006524and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006525
6526static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006527unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006528{
6529 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6530 register const Py_UNICODE *e;
6531
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006532 /* Shortcut for single character strings */
6533 if (PyUnicode_GET_SIZE(self) == 1 &&
6534 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006535 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006536
6537 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006538 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006539 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006540
6541 e = p + PyUnicode_GET_SIZE(self);
6542 for (; p < e; p++) {
6543 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006544 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006545 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006546 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006547}
6548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006549PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006550"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006551\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006552Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006553and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006554
6555static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006556unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006557{
6558 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6559 register const Py_UNICODE *e;
6560
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006561 /* Shortcut for single character strings */
6562 if (PyUnicode_GET_SIZE(self) == 1 &&
6563 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006564 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006565
6566 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006567 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006568 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006569
6570 e = p + PyUnicode_GET_SIZE(self);
6571 for (; p < e; p++) {
6572 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006573 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006574 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006575 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006576}
6577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006578PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006579"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006581Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006582False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583
6584static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006585unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586{
6587 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6588 register const Py_UNICODE *e;
6589
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 /* Shortcut for single character strings */
6591 if (PyUnicode_GET_SIZE(self) == 1 &&
6592 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006593 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006595 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006596 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006597 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 e = p + PyUnicode_GET_SIZE(self);
6600 for (; p < e; p++) {
6601 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006602 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006604 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605}
6606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006607PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006608"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006610Return True if all characters in S are digits\n\
6611and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612
6613static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006614unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615{
6616 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6617 register const Py_UNICODE *e;
6618
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 /* Shortcut for single character strings */
6620 if (PyUnicode_GET_SIZE(self) == 1 &&
6621 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006622 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006624 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006625 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006626 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006627
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 e = p + PyUnicode_GET_SIZE(self);
6629 for (; p < e; p++) {
6630 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006631 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006633 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634}
6635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006636PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006637"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006639Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006640False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641
6642static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006643unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644{
6645 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6646 register const Py_UNICODE *e;
6647
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 /* Shortcut for single character strings */
6649 if (PyUnicode_GET_SIZE(self) == 1 &&
6650 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006651 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006653 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006654 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006655 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006656
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 e = p + PyUnicode_GET_SIZE(self);
6658 for (; p < e; p++) {
6659 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006660 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006662 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663}
6664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006665PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666"S.join(sequence) -> unicode\n\
6667\n\
6668Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006669sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670
6671static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006672unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006674 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675}
6676
Martin v. Löwis18e16552006-02-15 17:27:45 +00006677static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678unicode_length(PyUnicodeObject *self)
6679{
6680 return self->length;
6681}
6682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006683PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006684"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685\n\
6686Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006687done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
6689static PyObject *
6690unicode_ljust(PyUnicodeObject *self, PyObject *args)
6691{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006692 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006693 Py_UNICODE fillchar = ' ';
6694
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006695 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 return NULL;
6697
Tim Peters7a29bd52001-09-12 03:03:31 +00006698 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 Py_INCREF(self);
6700 return (PyObject*) self;
6701 }
6702
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006703 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704}
6705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006706PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707"S.lower() -> unicode\n\
6708\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006709Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710
6711static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006712unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 return fixup(self, fixlower);
6715}
6716
/* Selectors telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
6725
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006726/* externally visible for str.strip(unicode) */
6727PyObject *
6728_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6729{
6730 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006731 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006732 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006733 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6734 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006735
Thomas Wouters477c8d52006-05-27 19:21:47 +00006736 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6737
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006738 i = 0;
6739 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006740 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6741 i++;
6742 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006743 }
6744
6745 j = len;
6746 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006747 do {
6748 j--;
6749 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6750 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006751 }
6752
6753 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006754 Py_INCREF(self);
6755 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006756 }
6757 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006758 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006759}
6760
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761
6762static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006763do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006765 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006766 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006767
6768 i = 0;
6769 if (striptype != RIGHTSTRIP) {
6770 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6771 i++;
6772 }
6773 }
6774
6775 j = len;
6776 if (striptype != LEFTSTRIP) {
6777 do {
6778 j--;
6779 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6780 j++;
6781 }
6782
6783 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6784 Py_INCREF(self);
6785 return (PyObject*)self;
6786 }
6787 else
6788 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006791
6792static PyObject *
6793do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6794{
6795 PyObject *sep = NULL;
6796
6797 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6798 return NULL;
6799
6800 if (sep != NULL && sep != Py_None) {
6801 if (PyUnicode_Check(sep))
6802 return _PyUnicode_XStrip(self, striptype, sep);
6803 else if (PyString_Check(sep)) {
6804 PyObject *res;
6805 sep = PyUnicode_FromObject(sep);
6806 if (sep==NULL)
6807 return NULL;
6808 res = _PyUnicode_XStrip(self, striptype, sep);
6809 Py_DECREF(sep);
6810 return res;
6811 }
6812 else {
6813 PyErr_Format(PyExc_TypeError,
6814 "%s arg must be None, unicode or str",
6815 STRIPNAME(striptype));
6816 return NULL;
6817 }
6818 }
6819
6820 return do_strip(self, striptype);
6821}
6822
6823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006824PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006825"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006826\n\
6827Return a copy of the string S with leading and trailing\n\
6828whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006829If chars is given and not None, remove characters in chars instead.\n\
6830If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006831
6832static PyObject *
6833unicode_strip(PyUnicodeObject *self, PyObject *args)
6834{
6835 if (PyTuple_GET_SIZE(args) == 0)
6836 return do_strip(self, BOTHSTRIP); /* Common case */
6837 else
6838 return do_argstrip(self, BOTHSTRIP, args);
6839}
6840
6841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006842PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006843"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006844\n\
6845Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006846If chars is given and not None, remove characters in chars instead.\n\
6847If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006848
6849static PyObject *
6850unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6851{
6852 if (PyTuple_GET_SIZE(args) == 0)
6853 return do_strip(self, LEFTSTRIP); /* Common case */
6854 else
6855 return do_argstrip(self, LEFTSTRIP, args);
6856}
6857
6858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006859PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006860"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006861\n\
6862Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006863If chars is given and not None, remove characters in chars instead.\n\
6864If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006865
6866static PyObject *
6867unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6868{
6869 if (PyTuple_GET_SIZE(args) == 0)
6870 return do_strip(self, RIGHTSTRIP); /* Common case */
6871 else
6872 return do_argstrip(self, RIGHTSTRIP, args);
6873}
6874
6875
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006877unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878{
6879 PyUnicodeObject *u;
6880 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006881 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006882 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883
6884 if (len < 0)
6885 len = 0;
6886
Tim Peters7a29bd52001-09-12 03:03:31 +00006887 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 /* no repeat, return original string */
6889 Py_INCREF(str);
6890 return (PyObject*) str;
6891 }
Tim Peters8f422462000-09-09 06:13:41 +00006892
6893 /* ensure # of chars needed doesn't overflow int and # of bytes
6894 * needed doesn't overflow size_t
6895 */
6896 nchars = len * str->length;
6897 if (len && nchars / len != str->length) {
6898 PyErr_SetString(PyExc_OverflowError,
6899 "repeated string is too long");
6900 return NULL;
6901 }
6902 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6903 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6904 PyErr_SetString(PyExc_OverflowError,
6905 "repeated string is too long");
6906 return NULL;
6907 }
6908 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 if (!u)
6910 return NULL;
6911
6912 p = u->str;
6913
Thomas Wouters477c8d52006-05-27 19:21:47 +00006914 if (str->length == 1 && len > 0) {
6915 Py_UNICODE_FILL(p, str->str[0], len);
6916 } else {
6917 Py_ssize_t done = 0; /* number of characters copied this far */
6918 if (done < nchars) {
6919 Py_UNICODE_COPY(p, str->str, str->length);
6920 done = str->length;
6921 }
6922 while (done < nchars) {
6923 int n = (done <= nchars-done) ? done : nchars-done;
6924 Py_UNICODE_COPY(p+done, p, n);
6925 done += n;
6926 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 }
6928
6929 return (PyObject*) u;
6930}
6931
6932PyObject *PyUnicode_Replace(PyObject *obj,
6933 PyObject *subobj,
6934 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006935 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936{
6937 PyObject *self;
6938 PyObject *str1;
6939 PyObject *str2;
6940 PyObject *result;
6941
6942 self = PyUnicode_FromObject(obj);
6943 if (self == NULL)
6944 return NULL;
6945 str1 = PyUnicode_FromObject(subobj);
6946 if (str1 == NULL) {
6947 Py_DECREF(self);
6948 return NULL;
6949 }
6950 str2 = PyUnicode_FromObject(replobj);
6951 if (str2 == NULL) {
6952 Py_DECREF(self);
6953 Py_DECREF(str1);
6954 return NULL;
6955 }
Tim Petersced69f82003-09-16 20:30:58 +00006956 result = replace((PyUnicodeObject *)self,
6957 (PyUnicodeObject *)str1,
6958 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 maxcount);
6960 Py_DECREF(self);
6961 Py_DECREF(str1);
6962 Py_DECREF(str2);
6963 return result;
6964}
6965
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006966PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967"S.replace (old, new[, maxsplit]) -> unicode\n\
6968\n\
6969Return a copy of S with all occurrences of substring\n\
6970old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006971given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972
6973static PyObject*
6974unicode_replace(PyUnicodeObject *self, PyObject *args)
6975{
6976 PyUnicodeObject *str1;
6977 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006978 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 PyObject *result;
6980
Martin v. Löwis18e16552006-02-15 17:27:45 +00006981 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 return NULL;
6983 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6984 if (str1 == NULL)
6985 return NULL;
6986 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006987 if (str2 == NULL) {
6988 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
6992 result = replace(self, str1, str2, maxcount);
6993
6994 Py_DECREF(str1);
6995 Py_DECREF(str2);
6996 return result;
6997}
6998
6999static
7000PyObject *unicode_repr(PyObject *unicode)
7001{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007002 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007003 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007004 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7005 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7006
7007 /* XXX(nnorwitz): rather than over-allocating, it would be
7008 better to choose a different scheme. Perhaps scan the
7009 first N-chars of the string and allocate based on that size.
7010 */
7011 /* Initial allocation is based on the longest-possible unichr
7012 escape.
7013
7014 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7015 unichr, so in this case it's the longest unichr escape. In
7016 narrow (UTF-16) builds this is five chars per source unichr
7017 since there are two unichrs in the surrogate pair, so in narrow
7018 (UTF-16) builds it's not the longest unichr escape.
7019
7020 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7021 so in the narrow (UTF-16) build case it's the longest unichr
7022 escape.
7023 */
7024
Walter Dörwald1ab83302007-05-18 17:15:44 +00007025 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007026 2 /* quotes */
7027#ifdef Py_UNICODE_WIDE
7028 + 10*size
7029#else
7030 + 6*size
7031#endif
7032 + 1);
7033 if (repr == NULL)
7034 return NULL;
7035
Walter Dörwald1ab83302007-05-18 17:15:44 +00007036 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007037
7038 /* Add quote */
7039 *p++ = (findchar(s, size, '\'') &&
7040 !findchar(s, size, '"')) ? '"' : '\'';
7041 while (size-- > 0) {
7042 Py_UNICODE ch = *s++;
7043
7044 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007045 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007046 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007047 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007048 continue;
7049 }
7050
7051#ifdef Py_UNICODE_WIDE
7052 /* Map 21-bit characters to '\U00xxxxxx' */
7053 else if (ch >= 0x10000) {
7054 *p++ = '\\';
7055 *p++ = 'U';
7056 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7057 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7058 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7059 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7060 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7061 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7062 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7063 *p++ = hexdigits[ch & 0x0000000F];
7064 continue;
7065 }
7066#else
7067 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7068 else if (ch >= 0xD800 && ch < 0xDC00) {
7069 Py_UNICODE ch2;
7070 Py_UCS4 ucs;
7071
7072 ch2 = *s++;
7073 size--;
7074 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7075 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7076 *p++ = '\\';
7077 *p++ = 'U';
7078 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7079 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7080 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7081 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7082 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7083 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7084 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7085 *p++ = hexdigits[ucs & 0x0000000F];
7086 continue;
7087 }
7088 /* Fall through: isolated surrogates are copied as-is */
7089 s--;
7090 size++;
7091 }
7092#endif
7093
7094 /* Map 16-bit characters to '\uxxxx' */
7095 if (ch >= 256) {
7096 *p++ = '\\';
7097 *p++ = 'u';
7098 *p++ = hexdigits[(ch >> 12) & 0x000F];
7099 *p++ = hexdigits[(ch >> 8) & 0x000F];
7100 *p++ = hexdigits[(ch >> 4) & 0x000F];
7101 *p++ = hexdigits[ch & 0x000F];
7102 }
7103
7104 /* Map special whitespace to '\t', \n', '\r' */
7105 else if (ch == '\t') {
7106 *p++ = '\\';
7107 *p++ = 't';
7108 }
7109 else if (ch == '\n') {
7110 *p++ = '\\';
7111 *p++ = 'n';
7112 }
7113 else if (ch == '\r') {
7114 *p++ = '\\';
7115 *p++ = 'r';
7116 }
7117
7118 /* Map non-printable US ASCII to '\xhh' */
7119 else if (ch < ' ' || ch >= 0x7F) {
7120 *p++ = '\\';
7121 *p++ = 'x';
7122 *p++ = hexdigits[(ch >> 4) & 0x000F];
7123 *p++ = hexdigits[ch & 0x000F];
7124 }
7125
7126 /* Copy everything else as-is */
7127 else
7128 *p++ = (char) ch;
7129 }
7130 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007131 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007132
7133 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007134 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007135 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136}
7137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007138PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139"S.rfind(sub [,start [,end]]) -> int\n\
7140\n\
7141Return the highest index in S where substring sub is found,\n\
7142such that sub is contained within s[start,end]. Optional\n\
7143arguments start and end are interpreted as in slice notation.\n\
7144\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007145Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
7147static PyObject *
7148unicode_rfind(PyUnicodeObject *self, PyObject *args)
7149{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007150 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007151 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007152 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007153 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154
Guido van Rossumb8872e62000-05-09 14:14:27 +00007155 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7156 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007158 substring = PyUnicode_FromObject(substring);
7159 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160 return NULL;
7161
Thomas Wouters477c8d52006-05-27 19:21:47 +00007162 result = stringlib_rfind_slice(
7163 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7164 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7165 start, end
7166 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167
7168 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007169
7170 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171}
7172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007173PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174"S.rindex(sub [,start [,end]]) -> int\n\
7175\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177
7178static PyObject *
7179unicode_rindex(PyUnicodeObject *self, PyObject *args)
7180{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007181 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007182 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007183 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007184 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185
Guido van Rossumb8872e62000-05-09 14:14:27 +00007186 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7187 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007189 substring = PyUnicode_FromObject(substring);
7190 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 return NULL;
7192
Thomas Wouters477c8d52006-05-27 19:21:47 +00007193 result = stringlib_rfind_slice(
7194 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7195 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7196 start, end
7197 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198
7199 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007200
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 if (result < 0) {
7202 PyErr_SetString(PyExc_ValueError, "substring not found");
7203 return NULL;
7204 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007205 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206}
7207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007208PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007209"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210\n\
7211Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007212done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213
7214static PyObject *
7215unicode_rjust(PyUnicodeObject *self, PyObject *args)
7216{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007217 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007218 Py_UNICODE fillchar = ' ';
7219
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007220 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 return NULL;
7222
Tim Peters7a29bd52001-09-12 03:03:31 +00007223 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 Py_INCREF(self);
7225 return (PyObject*) self;
7226 }
7227
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007228 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229}
7230
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007232unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233{
7234 /* standard clamping */
7235 if (start < 0)
7236 start = 0;
7237 if (end < 0)
7238 end = 0;
7239 if (end > self->length)
7240 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007241 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 /* full slice, return original string */
7243 Py_INCREF(self);
7244 return (PyObject*) self;
7245 }
7246 if (start > end)
7247 start = end;
7248 /* copy slice */
7249 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7250 end - start);
7251}
7252
7253PyObject *PyUnicode_Split(PyObject *s,
7254 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256{
7257 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007258
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 s = PyUnicode_FromObject(s);
7260 if (s == NULL)
7261 return NULL;
7262 if (sep != NULL) {
7263 sep = PyUnicode_FromObject(sep);
7264 if (sep == NULL) {
7265 Py_DECREF(s);
7266 return NULL;
7267 }
7268 }
7269
7270 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7271
7272 Py_DECREF(s);
7273 Py_XDECREF(sep);
7274 return result;
7275}
7276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007277PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278"S.split([sep [,maxsplit]]) -> list of strings\n\
7279\n\
7280Return a list of the words in S, using sep as the\n\
7281delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007282splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007283any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284
7285static PyObject*
7286unicode_split(PyUnicodeObject *self, PyObject *args)
7287{
7288 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007289 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290
Martin v. Löwis18e16552006-02-15 17:27:45 +00007291 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 return NULL;
7293
7294 if (substring == Py_None)
7295 return split(self, NULL, maxcount);
7296 else if (PyUnicode_Check(substring))
7297 return split(self, (PyUnicodeObject *)substring, maxcount);
7298 else
7299 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7300}
7301
Thomas Wouters477c8d52006-05-27 19:21:47 +00007302PyObject *
7303PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7304{
7305 PyObject* str_obj;
7306 PyObject* sep_obj;
7307 PyObject* out;
7308
7309 str_obj = PyUnicode_FromObject(str_in);
7310 if (!str_obj)
7311 return NULL;
7312 sep_obj = PyUnicode_FromObject(sep_in);
7313 if (!sep_obj) {
7314 Py_DECREF(str_obj);
7315 return NULL;
7316 }
7317
7318 out = stringlib_partition(
7319 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7320 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7321 );
7322
7323 Py_DECREF(sep_obj);
7324 Py_DECREF(str_obj);
7325
7326 return out;
7327}
7328
7329
7330PyObject *
7331PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7332{
7333 PyObject* str_obj;
7334 PyObject* sep_obj;
7335 PyObject* out;
7336
7337 str_obj = PyUnicode_FromObject(str_in);
7338 if (!str_obj)
7339 return NULL;
7340 sep_obj = PyUnicode_FromObject(sep_in);
7341 if (!sep_obj) {
7342 Py_DECREF(str_obj);
7343 return NULL;
7344 }
7345
7346 out = stringlib_rpartition(
7347 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7348 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7349 );
7350
7351 Py_DECREF(sep_obj);
7352 Py_DECREF(str_obj);
7353
7354 return out;
7355}
7356
7357PyDoc_STRVAR(partition__doc__,
7358"S.partition(sep) -> (head, sep, tail)\n\
7359\n\
7360Searches for the separator sep in S, and returns the part before it,\n\
7361the separator itself, and the part after it. If the separator is not\n\
7362found, returns S and two empty strings.");
7363
7364static PyObject*
7365unicode_partition(PyUnicodeObject *self, PyObject *separator)
7366{
7367 return PyUnicode_Partition((PyObject *)self, separator);
7368}
7369
7370PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007371"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007372\n\
7373Searches for the separator sep in S, starting at the end of S, and returns\n\
7374the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007375separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007376
7377static PyObject*
7378unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7379{
7380 return PyUnicode_RPartition((PyObject *)self, separator);
7381}
7382
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007383PyObject *PyUnicode_RSplit(PyObject *s,
7384 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007385 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007386{
7387 PyObject *result;
7388
7389 s = PyUnicode_FromObject(s);
7390 if (s == NULL)
7391 return NULL;
7392 if (sep != NULL) {
7393 sep = PyUnicode_FromObject(sep);
7394 if (sep == NULL) {
7395 Py_DECREF(s);
7396 return NULL;
7397 }
7398 }
7399
7400 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7401
7402 Py_DECREF(s);
7403 Py_XDECREF(sep);
7404 return result;
7405}
7406
7407PyDoc_STRVAR(rsplit__doc__,
7408"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7409\n\
7410Return a list of the words in S, using sep as the\n\
7411delimiter string, starting at the end of the string and\n\
7412working to the front. If maxsplit is given, at most maxsplit\n\
7413splits are done. If sep is not specified, any whitespace string\n\
7414is a separator.");
7415
7416static PyObject*
7417unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7418{
7419 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007420 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007421
Martin v. Löwis18e16552006-02-15 17:27:45 +00007422 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007423 return NULL;
7424
7425 if (substring == Py_None)
7426 return rsplit(self, NULL, maxcount);
7427 else if (PyUnicode_Check(substring))
7428 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7429 else
7430 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7431}
7432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007433PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007434"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435\n\
7436Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007437Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007438is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439
7440static PyObject*
7441unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7442{
Guido van Rossum86662912000-04-11 15:38:46 +00007443 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444
Guido van Rossum86662912000-04-11 15:38:46 +00007445 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 return NULL;
7447
Guido van Rossum86662912000-04-11 15:38:46 +00007448 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449}
7450
7451static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007452PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453{
Walter Dörwald346737f2007-05-31 10:44:43 +00007454 if (PyUnicode_CheckExact(self)) {
7455 Py_INCREF(self);
7456 return self;
7457 } else
7458 /* Subtype -- return genuine unicode string with the same value. */
7459 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7460 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461}
7462
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007463PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464"S.swapcase() -> unicode\n\
7465\n\
7466Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007467and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468
7469static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007470unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 return fixup(self, fixswapcase);
7473}
7474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007475PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476"S.translate(table) -> unicode\n\
7477\n\
7478Return a copy of the string S, where all characters have been mapped\n\
7479through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007480Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7481Unmapped characters are left untouched. Characters mapped to None\n\
7482are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483
7484static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007485unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486{
Tim Petersced69f82003-09-16 20:30:58 +00007487 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007489 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 "ignore");
7491}
7492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007493PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494"S.upper() -> unicode\n\
7495\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007496Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
7498static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007499unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 return fixup(self, fixupper);
7502}
7503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007504PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505"S.zfill(width) -> unicode\n\
7506\n\
7507Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007508of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509
7510static PyObject *
7511unicode_zfill(PyUnicodeObject *self, PyObject *args)
7512{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007513 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 PyUnicodeObject *u;
7515
Martin v. Löwis18e16552006-02-15 17:27:45 +00007516 Py_ssize_t width;
7517 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 return NULL;
7519
7520 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007521 if (PyUnicode_CheckExact(self)) {
7522 Py_INCREF(self);
7523 return (PyObject*) self;
7524 }
7525 else
7526 return PyUnicode_FromUnicode(
7527 PyUnicode_AS_UNICODE(self),
7528 PyUnicode_GET_SIZE(self)
7529 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530 }
7531
7532 fill = width - self->length;
7533
7534 u = pad(self, fill, 0, '0');
7535
Walter Dörwald068325e2002-04-15 13:36:47 +00007536 if (u == NULL)
7537 return NULL;
7538
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 if (u->str[fill] == '+' || u->str[fill] == '-') {
7540 /* move sign to beginning of string */
7541 u->str[0] = u->str[fill];
7542 u->str[fill] = '0';
7543 }
7544
7545 return (PyObject*) u;
7546}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547
#if 0
/* Compiled out.  Returns the value of unicode_freelist_size as an int
   object; the matching method-table entry describes it as being used
   only for debugging the implementation. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(unicode_freelist_size);

    return count;
}
#endif
7555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007556PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007557"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007559Return True if S starts with the specified prefix, False otherwise.\n\
7560With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007561With optional end, stop comparing S at that position.\n\
7562prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563
7564static PyObject *
7565unicode_startswith(PyUnicodeObject *self,
7566 PyObject *args)
7567{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007568 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007570 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007571 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007572 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007574 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007575 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007577 if (PyTuple_Check(subobj)) {
7578 Py_ssize_t i;
7579 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7580 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7581 PyTuple_GET_ITEM(subobj, i));
7582 if (substring == NULL)
7583 return NULL;
7584 result = tailmatch(self, substring, start, end, -1);
7585 Py_DECREF(substring);
7586 if (result) {
7587 Py_RETURN_TRUE;
7588 }
7589 }
7590 /* nothing matched */
7591 Py_RETURN_FALSE;
7592 }
7593 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007595 return NULL;
7596 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007598 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599}
7600
7601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007602PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007603"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007605Return True if S ends with the specified suffix, False otherwise.\n\
7606With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007607With optional end, stop comparing S at that position.\n\
7608suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609
7610static PyObject *
7611unicode_endswith(PyUnicodeObject *self,
7612 PyObject *args)
7613{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007614 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007616 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007617 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007618 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007620 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7621 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007623 if (PyTuple_Check(subobj)) {
7624 Py_ssize_t i;
7625 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7626 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7627 PyTuple_GET_ITEM(subobj, i));
7628 if (substring == NULL)
7629 return NULL;
7630 result = tailmatch(self, substring, start, end, +1);
7631 Py_DECREF(substring);
7632 if (result) {
7633 Py_RETURN_TRUE;
7634 }
7635 }
7636 Py_RETURN_FALSE;
7637 }
7638 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007640 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007642 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007644 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645}
7646
7647
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007648
7649static PyObject *
7650unicode_getnewargs(PyUnicodeObject *v)
7651{
7652 return Py_BuildValue("(u#)", v->str, v->length);
7653}
7654
7655
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656static PyMethodDef unicode_methods[] = {
7657
7658 /* Order is according to common usage: often used methods should
7659 appear first, since lookup is done sequentially. */
7660
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007661 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7662 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7663 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007664 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007665 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7666 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7667 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7668 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7669 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7670 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7671 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007672 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007673 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7674 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7675 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007676 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007677 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007678/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7679 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7680 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7681 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007682 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007683 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007684 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007685 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007686 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7687 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7688 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7689 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7690 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7691 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7692 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7693 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7694 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7695 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7696 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7697 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7698 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7699 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007700 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007701#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007702 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703#endif
7704
7705#if 0
7706 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007707 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708#endif
7709
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007710 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 {NULL, NULL}
7712};
7713
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007714static PyObject *
7715unicode_mod(PyObject *v, PyObject *w)
7716{
7717 if (!PyUnicode_Check(v)) {
7718 Py_INCREF(Py_NotImplemented);
7719 return Py_NotImplemented;
7720 }
7721 return PyUnicode_Format(v, w);
7722}
7723
7724static PyNumberMethods unicode_as_number = {
7725 0, /*nb_add*/
7726 0, /*nb_subtract*/
7727 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007728 unicode_mod, /*nb_remainder*/
7729};
7730
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007732 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007733 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007734 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7735 (ssizeargfunc) unicode_getitem, /* sq_item */
7736 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 0, /* sq_ass_item */
7738 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007739 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740};
7741
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007742static PyObject*
7743unicode_subscript(PyUnicodeObject* self, PyObject* item)
7744{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007745 if (PyIndex_Check(item)) {
7746 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007747 if (i == -1 && PyErr_Occurred())
7748 return NULL;
7749 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007750 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007751 return unicode_getitem(self, i);
7752 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007753 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007754 Py_UNICODE* source_buf;
7755 Py_UNICODE* result_buf;
7756 PyObject* result;
7757
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007758 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007759 &start, &stop, &step, &slicelength) < 0) {
7760 return NULL;
7761 }
7762
7763 if (slicelength <= 0) {
7764 return PyUnicode_FromUnicode(NULL, 0);
7765 } else {
7766 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007767 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7768 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007769
7770 if (result_buf == NULL)
7771 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007772
7773 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7774 result_buf[i] = source_buf[cur];
7775 }
Tim Petersced69f82003-09-16 20:30:58 +00007776
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007777 result = PyUnicode_FromUnicode(result_buf, slicelength);
7778 PyMem_FREE(result_buf);
7779 return result;
7780 }
7781 } else {
7782 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7783 return NULL;
7784 }
7785}
7786
7787static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007788 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007789 (binaryfunc)unicode_subscript, /* mp_subscript */
7790 (objobjargproc)0, /* mp_ass_subscript */
7791};
7792
Martin v. Löwis18e16552006-02-15 17:27:45 +00007793static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007795 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 const void **ptr)
7797{
7798 if (index != 0) {
7799 PyErr_SetString(PyExc_SystemError,
7800 "accessing non-existent unicode segment");
7801 return -1;
7802 }
7803 *ptr = (void *) self->str;
7804 return PyUnicode_GET_DATA_SIZE(self);
7805}
7806
Martin v. Löwis18e16552006-02-15 17:27:45 +00007807static Py_ssize_t
7808unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809 const void **ptr)
7810{
7811 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007812 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813 return -1;
7814}
7815
7816static int
7817unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007818 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819{
7820 if (lenp)
7821 *lenp = PyUnicode_GET_DATA_SIZE(self);
7822 return 1;
7823}
7824
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007825static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007827 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 const void **ptr)
7829{
7830 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007831
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 if (index != 0) {
7833 PyErr_SetString(PyExc_SystemError,
7834 "accessing non-existent unicode segment");
7835 return -1;
7836 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007837 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 if (str == NULL)
7839 return -1;
7840 *ptr = (void *) PyString_AS_STRING(str);
7841 return PyString_GET_SIZE(str);
7842}
7843
7844/* Helpers for PyUnicode_Format() */
7845
7846static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007847getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007849 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 if (argidx < arglen) {
7851 (*p_argidx)++;
7852 if (arglen < 0)
7853 return args;
7854 else
7855 return PyTuple_GetItem(args, argidx);
7856 }
7857 PyErr_SetString(PyExc_TypeError,
7858 "not enough arguments for format string");
7859 return NULL;
7860}
7861
/* Bit flags describing the modifiers of a single % format specifier. */
#define F_LJUST (1 << 0)    /* '-' : left-justify within the field */
#define F_SIGN  (1 << 1)    /* '+' : always emit a sign */
#define F_BLANK (1 << 2)    /* ' ' : blank in place of a plus sign */
#define F_ALT   (1 << 3)    /* '#' : alternate form */
#define F_ZERO  (1 << 4)    /* '0' : pad with zeros */
7867
Martin v. Löwis18e16552006-02-15 17:27:45 +00007868static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007869strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007871 register Py_ssize_t i;
7872 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873 for (i = len - 1; i >= 0; i--)
7874 buffer[i] = (Py_UNICODE) charbuffer[i];
7875
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876 return len;
7877}
7878
Neal Norwitzfc76d632006-01-10 06:03:13 +00007879static int
7880doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7881{
Tim Peters15231542006-02-16 01:08:01 +00007882 Py_ssize_t result;
7883
Neal Norwitzfc76d632006-01-10 06:03:13 +00007884 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007885 result = strtounicode(buffer, (char *)buffer);
7886 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007887}
7888
7889static int
7890longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7891{
Tim Peters15231542006-02-16 01:08:01 +00007892 Py_ssize_t result;
7893
Neal Norwitzfc76d632006-01-10 06:03:13 +00007894 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007895 result = strtounicode(buffer, (char *)buffer);
7896 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007897}
7898
Guido van Rossum078151d2002-08-11 04:24:12 +00007899/* XXX To save some code duplication, formatfloat/long/int could have been
7900 shared with stringobject.c, converting from 8-bit to Unicode after the
7901 formatting is done. */
7902
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903static int
7904formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007905 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906 int flags,
7907 int prec,
7908 int type,
7909 PyObject *v)
7910{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007911 /* fmt = '%#.' + `prec` + `type`
7912 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913 char fmt[20];
7914 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007915
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916 x = PyFloat_AsDouble(v);
7917 if (x == -1.0 && PyErr_Occurred())
7918 return -1;
7919 if (prec < 0)
7920 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7922 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007923 /* Worst case length calc to ensure no buffer overrun:
7924
7925 'g' formats:
7926 fmt = %#.<prec>g
7927 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7928 for any double rep.)
7929 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7930
7931 'f' formats:
7932 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7933 len = 1 + 50 + 1 + prec = 52 + prec
7934
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007935 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007936 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007937
7938 */
7939 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7940 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007941 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007942 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007943 return -1;
7944 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007945 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7946 (flags&F_ALT) ? "#" : "",
7947 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007948 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949}
7950
Tim Peters38fd5b62000-09-21 05:43:11 +00007951static PyObject*
7952formatlong(PyObject *val, int flags, int prec, int type)
7953{
7954 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007955 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00007956 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00007957 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007958
7959 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7960 if (!str)
7961 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007962 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00007963 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00007964 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007965}
7966
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967static int
7968formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007969 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 int flags,
7971 int prec,
7972 int type,
7973 PyObject *v)
7974{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007975 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007976 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7977 * + 1 + 1
7978 * = 24
7979 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007980 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007981 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 long x;
7983
7984 x = PyInt_AsLong(v);
7985 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007986 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007987 if (x < 0 && type == 'u') {
7988 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007989 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007990 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7991 sign = "-";
7992 else
7993 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007995 prec = 1;
7996
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007997 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7998 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007999 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008000 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008001 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008002 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008003 return -1;
8004 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008005
8006 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008007 (type == 'x' || type == 'X' || type == 'o')) {
8008 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008009 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008010 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008011 * - when 0 is being converted, the C standard leaves off
8012 * the '0x' or '0X', which is inconsistent with other
8013 * %#x/%#X conversions and inconsistent with Python's
8014 * hex() function
8015 * - there are platforms that violate the standard and
8016 * convert 0 with the '0x' or '0X'
8017 * (Metrowerks, Compaq Tru64)
8018 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008019 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008020 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008021 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008022 * We can achieve the desired consistency by inserting our
8023 * own '0x' or '0X' prefix, and substituting %x/%X in place
8024 * of %#x/%#X.
8025 *
8026 * Note that this is the same approach as used in
8027 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008028 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008029 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8030 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008031 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008032 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008033 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8034 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008035 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008036 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008037 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008038 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008039 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008040 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041}
8042
8043static int
8044formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008045 size_t buflen,
8046 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008048 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008049 if (PyUnicode_Check(v)) {
8050 if (PyUnicode_GET_SIZE(v) != 1)
8051 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008055 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008056 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008057 goto onError;
8058 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060
8061 else {
8062 /* Integer input truncated to a character */
8063 long x;
8064 x = PyInt_AsLong(v);
8065 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008066 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008067#ifdef Py_UNICODE_WIDE
8068 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008069 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008070 "%c arg not in range(0x110000) "
8071 "(wide Python build)");
8072 return -1;
8073 }
8074#else
8075 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008076 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008077 "%c arg not in range(0x10000) "
8078 "(narrow Python build)");
8079 return -1;
8080 }
8081#endif
8082 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 }
8084 buf[1] = '\0';
8085 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008086
8087 onError:
8088 PyErr_SetString(PyExc_TypeError,
8089 "%c requires int or char");
8090 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091}
8092
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008093/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8094
8095 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8096 chars are formatted. XXX This is a magic number. Each formatting
8097 routine does bounds checking to ensure no overflow, but a better
8098 solution may be to malloc a buffer of appropriate size for each
8099 format. For now, the current solution is sufficient.
8100*/
8101#define FORMATBUFLEN (size_t)120
8102
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103PyObject *PyUnicode_Format(PyObject *format,
8104 PyObject *args)
8105{
8106 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008107 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108 int args_owned = 0;
8109 PyUnicodeObject *result = NULL;
8110 PyObject *dict = NULL;
8111 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008112
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 if (format == NULL || args == NULL) {
8114 PyErr_BadInternalCall();
8115 return NULL;
8116 }
8117 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008118 if (uformat == NULL)
8119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 fmt = PyUnicode_AS_UNICODE(uformat);
8121 fmtcnt = PyUnicode_GET_SIZE(uformat);
8122
8123 reslen = rescnt = fmtcnt + 100;
8124 result = _PyUnicode_New(reslen);
8125 if (result == NULL)
8126 goto onError;
8127 res = PyUnicode_AS_UNICODE(result);
8128
8129 if (PyTuple_Check(args)) {
8130 arglen = PyTuple_Size(args);
8131 argidx = 0;
8132 }
8133 else {
8134 arglen = -1;
8135 argidx = -2;
8136 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008137 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
8138 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 dict = args;
8140
8141 while (--fmtcnt >= 0) {
8142 if (*fmt != '%') {
8143 if (--rescnt < 0) {
8144 rescnt = fmtcnt + 100;
8145 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008146 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008147 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8149 --rescnt;
8150 }
8151 *res++ = *fmt++;
8152 }
8153 else {
8154 /* Got a format specifier */
8155 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 Py_UNICODE c = '\0';
8159 Py_UNICODE fill;
8160 PyObject *v = NULL;
8161 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008162 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008164 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008165 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166
8167 fmt++;
8168 if (*fmt == '(') {
8169 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008170 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171 PyObject *key;
8172 int pcount = 1;
8173
8174 if (dict == NULL) {
8175 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008176 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177 goto onError;
8178 }
8179 ++fmt;
8180 --fmtcnt;
8181 keystart = fmt;
8182 /* Skip over balanced parentheses */
8183 while (pcount > 0 && --fmtcnt >= 0) {
8184 if (*fmt == ')')
8185 --pcount;
8186 else if (*fmt == '(')
8187 ++pcount;
8188 fmt++;
8189 }
8190 keylen = fmt - keystart - 1;
8191 if (fmtcnt < 0 || pcount > 0) {
8192 PyErr_SetString(PyExc_ValueError,
8193 "incomplete format key");
8194 goto onError;
8195 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008196#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008197 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 then looked up since Python uses strings to hold
8199 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008200 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 key = PyUnicode_EncodeUTF8(keystart,
8202 keylen,
8203 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008204#else
8205 key = PyUnicode_FromUnicode(keystart, keylen);
8206#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207 if (key == NULL)
8208 goto onError;
8209 if (args_owned) {
8210 Py_DECREF(args);
8211 args_owned = 0;
8212 }
8213 args = PyObject_GetItem(dict, key);
8214 Py_DECREF(key);
8215 if (args == NULL) {
8216 goto onError;
8217 }
8218 args_owned = 1;
8219 arglen = -1;
8220 argidx = -2;
8221 }
8222 while (--fmtcnt >= 0) {
8223 switch (c = *fmt++) {
8224 case '-': flags |= F_LJUST; continue;
8225 case '+': flags |= F_SIGN; continue;
8226 case ' ': flags |= F_BLANK; continue;
8227 case '#': flags |= F_ALT; continue;
8228 case '0': flags |= F_ZERO; continue;
8229 }
8230 break;
8231 }
8232 if (c == '*') {
8233 v = getnextarg(args, arglen, &argidx);
8234 if (v == NULL)
8235 goto onError;
8236 if (!PyInt_Check(v)) {
8237 PyErr_SetString(PyExc_TypeError,
8238 "* wants int");
8239 goto onError;
8240 }
8241 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008242 if (width == -1 && PyErr_Occurred())
8243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 if (width < 0) {
8245 flags |= F_LJUST;
8246 width = -width;
8247 }
8248 if (--fmtcnt >= 0)
8249 c = *fmt++;
8250 }
8251 else if (c >= '0' && c <= '9') {
8252 width = c - '0';
8253 while (--fmtcnt >= 0) {
8254 c = *fmt++;
8255 if (c < '0' || c > '9')
8256 break;
8257 if ((width*10) / 10 != width) {
8258 PyErr_SetString(PyExc_ValueError,
8259 "width too big");
8260 goto onError;
8261 }
8262 width = width*10 + (c - '0');
8263 }
8264 }
8265 if (c == '.') {
8266 prec = 0;
8267 if (--fmtcnt >= 0)
8268 c = *fmt++;
8269 if (c == '*') {
8270 v = getnextarg(args, arglen, &argidx);
8271 if (v == NULL)
8272 goto onError;
8273 if (!PyInt_Check(v)) {
8274 PyErr_SetString(PyExc_TypeError,
8275 "* wants int");
8276 goto onError;
8277 }
8278 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008279 if (prec == -1 && PyErr_Occurred())
8280 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 if (prec < 0)
8282 prec = 0;
8283 if (--fmtcnt >= 0)
8284 c = *fmt++;
8285 }
8286 else if (c >= '0' && c <= '9') {
8287 prec = c - '0';
8288 while (--fmtcnt >= 0) {
8289 c = Py_CHARMASK(*fmt++);
8290 if (c < '0' || c > '9')
8291 break;
8292 if ((prec*10) / 10 != prec) {
8293 PyErr_SetString(PyExc_ValueError,
8294 "prec too big");
8295 goto onError;
8296 }
8297 prec = prec*10 + (c - '0');
8298 }
8299 }
8300 } /* prec */
8301 if (fmtcnt >= 0) {
8302 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 if (--fmtcnt >= 0)
8304 c = *fmt++;
8305 }
8306 }
8307 if (fmtcnt < 0) {
8308 PyErr_SetString(PyExc_ValueError,
8309 "incomplete format");
8310 goto onError;
8311 }
8312 if (c != '%') {
8313 v = getnextarg(args, arglen, &argidx);
8314 if (v == NULL)
8315 goto onError;
8316 }
8317 sign = 0;
8318 fill = ' ';
8319 switch (c) {
8320
8321 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008322 pbuf = formatbuf;
8323 /* presume that buffer length is at least 1 */
8324 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 len = 1;
8326 break;
8327
8328 case 's':
8329 case 'r':
8330 if (PyUnicode_Check(v) && c == 's') {
8331 temp = v;
8332 Py_INCREF(temp);
8333 }
8334 else {
8335 PyObject *unicode;
8336 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008337 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 else
8339 temp = PyObject_Repr(v);
8340 if (temp == NULL)
8341 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008342 if (PyUnicode_Check(temp))
8343 /* nothing to do */;
8344 else if (PyString_Check(temp)) {
8345 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008346 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008348 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008350 Py_DECREF(temp);
8351 temp = unicode;
8352 if (temp == NULL)
8353 goto onError;
8354 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008355 else {
8356 Py_DECREF(temp);
8357 PyErr_SetString(PyExc_TypeError,
8358 "%s argument has non-string str()");
8359 goto onError;
8360 }
8361 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008362 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 len = PyUnicode_GET_SIZE(temp);
8364 if (prec >= 0 && len > prec)
8365 len = prec;
8366 break;
8367
8368 case 'i':
8369 case 'd':
8370 case 'u':
8371 case 'o':
8372 case 'x':
8373 case 'X':
8374 if (c == 'i')
8375 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008376 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008377 temp = formatlong(v, flags, prec, c);
8378 if (!temp)
8379 goto onError;
8380 pbuf = PyUnicode_AS_UNICODE(temp);
8381 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008382 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008384 else {
8385 pbuf = formatbuf;
8386 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8387 flags, prec, c, v);
8388 if (len < 0)
8389 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008390 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008391 }
8392 if (flags & F_ZERO)
8393 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 break;
8395
8396 case 'e':
8397 case 'E':
8398 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008399 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 case 'g':
8401 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008402 if (c == 'F')
8403 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008404 pbuf = formatbuf;
8405 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8406 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 if (len < 0)
8408 goto onError;
8409 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008410 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411 fill = '0';
8412 break;
8413
8414 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008415 pbuf = formatbuf;
8416 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417 if (len < 0)
8418 goto onError;
8419 break;
8420
8421 default:
8422 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008423 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008424 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008425 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008426 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008427 (Py_ssize_t)(fmt - 1 -
8428 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 goto onError;
8430 }
8431 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008432 if (*pbuf == '-' || *pbuf == '+') {
8433 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 len--;
8435 }
8436 else if (flags & F_SIGN)
8437 sign = '+';
8438 else if (flags & F_BLANK)
8439 sign = ' ';
8440 else
8441 sign = 0;
8442 }
8443 if (width < len)
8444 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008445 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 reslen -= rescnt;
8447 rescnt = width + fmtcnt + 100;
8448 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008449 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008450 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008451 PyErr_NoMemory();
8452 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008453 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008454 if (_PyUnicode_Resize(&result, reslen) < 0) {
8455 Py_XDECREF(temp);
8456 goto onError;
8457 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 res = PyUnicode_AS_UNICODE(result)
8459 + reslen - rescnt;
8460 }
8461 if (sign) {
8462 if (fill != ' ')
8463 *res++ = sign;
8464 rescnt--;
8465 if (width > len)
8466 width--;
8467 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008468 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008469 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008470 assert(pbuf[1] == c);
8471 if (fill != ' ') {
8472 *res++ = *pbuf++;
8473 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008474 }
Tim Petersfff53252001-04-12 18:38:48 +00008475 rescnt -= 2;
8476 width -= 2;
8477 if (width < 0)
8478 width = 0;
8479 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481 if (width > len && !(flags & F_LJUST)) {
8482 do {
8483 --rescnt;
8484 *res++ = fill;
8485 } while (--width > len);
8486 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008487 if (fill == ' ') {
8488 if (sign)
8489 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008490 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008491 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008492 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008493 *res++ = *pbuf++;
8494 *res++ = *pbuf++;
8495 }
8496 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008497 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 res += len;
8499 rescnt -= len;
8500 while (--width >= len) {
8501 --rescnt;
8502 *res++ = ' ';
8503 }
8504 if (dict && (argidx < arglen) && c != '%') {
8505 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008506 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008507 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 goto onError;
8509 }
8510 Py_XDECREF(temp);
8511 } /* '%' */
8512 } /* until end */
8513 if (argidx < arglen && !dict) {
8514 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008515 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 goto onError;
8517 }
8518
Thomas Woutersa96affe2006-03-12 00:29:36 +00008519 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8520 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521 if (args_owned) {
8522 Py_DECREF(args);
8523 }
8524 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525 return (PyObject *)result;
8526
8527 onError:
8528 Py_XDECREF(result);
8529 Py_DECREF(uformat);
8530 if (args_owned) {
8531 Py_DECREF(args);
8532 }
8533 return NULL;
8534}
8535
8536static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008537 (readbufferproc) unicode_buffer_getreadbuf,
8538 (writebufferproc) unicode_buffer_getwritebuf,
8539 (segcountproc) unicode_buffer_getsegcount,
8540 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541};
8542
Jeremy Hylton938ace62002-07-17 16:30:39 +00008543static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008544unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8545
Tim Peters6d6c1a32001-08-02 04:15:00 +00008546static PyObject *
8547unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8548{
8549 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008550 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008551 char *encoding = NULL;
8552 char *errors = NULL;
8553
Guido van Rossume023fe02001-08-30 03:12:59 +00008554 if (type != &PyUnicode_Type)
8555 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008556 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8557 kwlist, &x, &encoding, &errors))
8558 return NULL;
8559 if (x == NULL)
8560 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008561 if (encoding == NULL && errors == NULL)
8562 return PyObject_Unicode(x);
8563 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008564 return PyUnicode_FromEncodedObject(x, encoding, errors);
8565}
8566
Guido van Rossume023fe02001-08-30 03:12:59 +00008567static PyObject *
8568unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8569{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008570 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008571 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008572
8573 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8574 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8575 if (tmp == NULL)
8576 return NULL;
8577 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008578 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008579 if (pnew == NULL) {
8580 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008581 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008582 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008583 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8584 if (pnew->str == NULL) {
8585 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008586 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008587 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008588 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008589 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008590 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8591 pnew->length = n;
8592 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008593 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008594 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008595}
8596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008597PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008598"unicode(string [, encoding[, errors]]) -> object\n\
8599\n\
8600Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008601encoding defaults to the current default string encoding.\n\
8602errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008603
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008604static PyObject *unicode_iter(PyObject *seq);
8605
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606PyTypeObject PyUnicode_Type = {
8607 PyObject_HEAD_INIT(&PyType_Type)
8608 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008609 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 sizeof(PyUnicodeObject), /* tp_size */
8611 0, /* tp_itemsize */
8612 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008613 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008615 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008617 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008618 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008619 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008621 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 (hashfunc) unicode_hash, /* tp_hash*/
8623 0, /* tp_call*/
8624 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008625 PyObject_GenericGetAttr, /* tp_getattro */
8626 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008628 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8629 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008630 unicode_doc, /* tp_doc */
8631 0, /* tp_traverse */
8632 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008633 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008634 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008635 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008636 0, /* tp_iternext */
8637 unicode_methods, /* tp_methods */
8638 0, /* tp_members */
8639 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008640 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008641 0, /* tp_dict */
8642 0, /* tp_descr_get */
8643 0, /* tp_descr_set */
8644 0, /* tp_dictoffset */
8645 0, /* tp_init */
8646 0, /* tp_alloc */
8647 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008648 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649};
8650
8651/* Initialize the Unicode implementation */
8652
Thomas Wouters78890102000-07-22 19:25:51 +00008653void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008655 int i;
8656
Thomas Wouters477c8d52006-05-27 19:21:47 +00008657 /* XXX - move this array to unicodectype.c ? */
8658 Py_UNICODE linebreak[] = {
8659 0x000A, /* LINE FEED */
8660 0x000D, /* CARRIAGE RETURN */
8661 0x001C, /* FILE SEPARATOR */
8662 0x001D, /* GROUP SEPARATOR */
8663 0x001E, /* RECORD SEPARATOR */
8664 0x0085, /* NEXT LINE */
8665 0x2028, /* LINE SEPARATOR */
8666 0x2029, /* PARAGRAPH SEPARATOR */
8667 };
8668
Fred Drakee4315f52000-05-09 19:53:39 +00008669 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008670 unicode_freelist = NULL;
8671 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008673 if (!unicode_empty)
8674 return;
8675
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008676 for (i = 0; i < 256; i++)
8677 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008678 if (PyType_Ready(&PyUnicode_Type) < 0)
8679 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008680
8681 /* initialize the linebreak bloom filter */
8682 bloom_linebreak = make_bloom_mask(
8683 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8684 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008685
8686 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687}
8688
8689/* Finalize the Unicode implementation */
8690
8691void
Thomas Wouters78890102000-07-22 19:25:51 +00008692_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008694 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008695 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008697 Py_XDECREF(unicode_empty);
8698 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008699
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008700 for (i = 0; i < 256; i++) {
8701 if (unicode_latin1[i]) {
8702 Py_DECREF(unicode_latin1[i]);
8703 unicode_latin1[i] = NULL;
8704 }
8705 }
8706
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008707 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 PyUnicodeObject *v = u;
8709 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008710 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008711 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008712 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008713 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008715 unicode_freelist = NULL;
8716 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008718
Walter Dörwald16807132007-05-25 13:52:07 +00008719void
8720PyUnicode_InternInPlace(PyObject **p)
8721{
8722 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8723 PyObject *t;
8724 if (s == NULL || !PyUnicode_Check(s))
8725 Py_FatalError(
8726 "PyUnicode_InternInPlace: unicode strings only please!");
8727 /* If it's a subclass, we don't really know what putting
8728 it in the interned dict might do. */
8729 if (!PyUnicode_CheckExact(s))
8730 return;
8731 if (PyUnicode_CHECK_INTERNED(s))
8732 return;
8733 if (interned == NULL) {
8734 interned = PyDict_New();
8735 if (interned == NULL) {
8736 PyErr_Clear(); /* Don't leave an exception */
8737 return;
8738 }
8739 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008740 /* It might be that the GetItem call fails even
8741 though the key is present in the dictionary,
8742 namely when this happens during a stack overflow. */
8743 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008744 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008745 Py_END_ALLOW_RECURSION
8746
Walter Dörwald16807132007-05-25 13:52:07 +00008747 if (t) {
8748 Py_INCREF(t);
8749 Py_DECREF(*p);
8750 *p = t;
8751 return;
8752 }
8753
Martin v. Löwis5b222132007-06-10 09:51:05 +00008754 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008755 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8756 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008757 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008758 return;
8759 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008760 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008761 /* The two references in interned are not counted by refcnt.
8762 The deallocator will take care of this */
8763 s->ob_refcnt -= 2;
8764 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8765}
8766
8767void
8768PyUnicode_InternImmortal(PyObject **p)
8769{
8770 PyUnicode_InternInPlace(p);
8771 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8772 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8773 Py_INCREF(*p);
8774 }
8775}
8776
8777PyObject *
8778PyUnicode_InternFromString(const char *cp)
8779{
8780 PyObject *s = PyUnicode_FromString(cp);
8781 if (s == NULL)
8782 return NULL;
8783 PyUnicode_InternInPlace(&s);
8784 return s;
8785}
8786
8787void _Py_ReleaseInternedUnicodeStrings(void)
8788{
8789 PyObject *keys;
8790 PyUnicodeObject *s;
8791 Py_ssize_t i, n;
8792 Py_ssize_t immortal_size = 0, mortal_size = 0;
8793
8794 if (interned == NULL || !PyDict_Check(interned))
8795 return;
8796 keys = PyDict_Keys(interned);
8797 if (keys == NULL || !PyList_Check(keys)) {
8798 PyErr_Clear();
8799 return;
8800 }
8801
8802 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8803 detector, interned unicode strings are not forcibly deallocated;
8804 rather, we give them their stolen references back, and then clear
8805 and DECREF the interned dict. */
8806
8807 n = PyList_GET_SIZE(keys);
8808 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8809 n);
8810 for (i = 0; i < n; i++) {
8811 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8812 switch (s->state) {
8813 case SSTATE_NOT_INTERNED:
8814 /* XXX Shouldn't happen */
8815 break;
8816 case SSTATE_INTERNED_IMMORTAL:
8817 s->ob_refcnt += 1;
8818 immortal_size += s->length;
8819 break;
8820 case SSTATE_INTERNED_MORTAL:
8821 s->ob_refcnt += 2;
8822 mortal_size += s->length;
8823 break;
8824 default:
8825 Py_FatalError("Inconsistent interned string state.");
8826 }
8827 s->state = SSTATE_NOT_INTERNED;
8828 }
8829 fprintf(stderr, "total size of all interned strings: "
8830 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8831 "mortal/immortal\n", mortal_size, immortal_size);
8832 Py_DECREF(keys);
8833 PyDict_Clear(interned);
8834 Py_DECREF(interned);
8835 interned = NULL;
8836}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008837
8838
8839/********************* Unicode Iterator **************************/
8840
8841typedef struct {
8842 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008843 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008844 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8845} unicodeiterobject;
8846
8847static void
8848unicodeiter_dealloc(unicodeiterobject *it)
8849{
8850 _PyObject_GC_UNTRACK(it);
8851 Py_XDECREF(it->it_seq);
8852 PyObject_GC_Del(it);
8853}
8854
8855static int
8856unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8857{
8858 Py_VISIT(it->it_seq);
8859 return 0;
8860}
8861
8862static PyObject *
8863unicodeiter_next(unicodeiterobject *it)
8864{
8865 PyUnicodeObject *seq;
8866 PyObject *item;
8867
8868 assert(it != NULL);
8869 seq = it->it_seq;
8870 if (seq == NULL)
8871 return NULL;
8872 assert(PyUnicode_Check(seq));
8873
8874 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008875 item = PyUnicode_FromUnicode(
8876 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008877 if (item != NULL)
8878 ++it->it_index;
8879 return item;
8880 }
8881
8882 Py_DECREF(seq);
8883 it->it_seq = NULL;
8884 return NULL;
8885}
8886
8887static PyObject *
8888unicodeiter_len(unicodeiterobject *it)
8889{
8890 Py_ssize_t len = 0;
8891 if (it->it_seq)
8892 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8893 return PyInt_FromSsize_t(len);
8894}
8895
8896PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8897
8898static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008899 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8900 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008901 {NULL, NULL} /* sentinel */
8902};
8903
8904PyTypeObject PyUnicodeIter_Type = {
8905 PyObject_HEAD_INIT(&PyType_Type)
8906 0, /* ob_size */
8907 "unicodeiterator", /* tp_name */
8908 sizeof(unicodeiterobject), /* tp_basicsize */
8909 0, /* tp_itemsize */
8910 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008911 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008912 0, /* tp_print */
8913 0, /* tp_getattr */
8914 0, /* tp_setattr */
8915 0, /* tp_compare */
8916 0, /* tp_repr */
8917 0, /* tp_as_number */
8918 0, /* tp_as_sequence */
8919 0, /* tp_as_mapping */
8920 0, /* tp_hash */
8921 0, /* tp_call */
8922 0, /* tp_str */
8923 PyObject_GenericGetAttr, /* tp_getattro */
8924 0, /* tp_setattro */
8925 0, /* tp_as_buffer */
8926 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8927 0, /* tp_doc */
8928 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8929 0, /* tp_clear */
8930 0, /* tp_richcompare */
8931 0, /* tp_weaklistoffset */
8932 PyObject_SelfIter, /* tp_iter */
8933 (iternextfunc)unicodeiter_next, /* tp_iternext */
8934 unicodeiter_methods, /* tp_methods */
8935 0,
8936};
8937
8938static PyObject *
8939unicode_iter(PyObject *seq)
8940{
8941 unicodeiterobject *it;
8942
8943 if (!PyUnicode_Check(seq)) {
8944 PyErr_BadInternalCall();
8945 return NULL;
8946 }
8947 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8948 if (it == NULL)
8949 return NULL;
8950 it->it_index = 0;
8951 Py_INCREF(seq);
8952 it->it_seq = (PyUnicodeObject *)seq;
8953 _PyObject_GC_TRACK(it);
8954 return (PyObject *)it;
8955}
8956
Martin v. Löwis5b222132007-06-10 09:51:05 +00008957size_t
8958Py_UNICODE_strlen(const Py_UNICODE *u)
8959{
8960 int res = 0;
8961 while(*u++)
8962 res++;
8963 return res;
8964}
8965
8966Py_UNICODE*
8967Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8968{
8969 Py_UNICODE *u = s1;
8970 while ((*u++ = *s2++));
8971 return s1;
8972}
8973
8974Py_UNICODE*
8975Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8976{
8977 Py_UNICODE *u = s1;
8978 while ((*u++ = *s2++))
8979 if (n-- == 0)
8980 break;
8981 return s1;
8982}
8983
8984int
8985Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8986{
8987 while (*s1 && *s2 && *s1 == *s2)
8988 s1++, s2++;
8989 if (*s1 && *s2)
8990 return (*s1 < *s2) ? -1 : +1;
8991 if (*s1)
8992 return 1;
8993 if (*s2)
8994 return -1;
8995 return 0;
8996}
8997
8998Py_UNICODE*
8999Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9000{
9001 const Py_UNICODE *p;
9002 for (p = s; *p; p++)
9003 if (*p == c)
9004 return (Py_UNICODE*)p;
9005 return NULL;
9006}
9007
9008
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009009#ifdef __cplusplus
9010}
9011#endif
9012
9013
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009014/*
9015Local variables:
9016c-basic-offset: 4
9017indent-tabs-mode: nil
9018End:
9019*/