blob: 5dc3b4150d5da3348ae547664aea4b7e3ef302ec [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest 5
   bits of each Unicode character as the bit index. */
138
/* the linebreak mask is set up by _PyUnicode_Init() */
140
/* Integer type used to hold a bloom mask. Only bits 0..31 are ever
   selected, because the bit index is the low 5 bits of a character. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() as a cheap pre-check. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is an unsigned long so that selecting bit 31 does
   not overflow a signed int, and `mask` is parenthesized so that compound
   expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if `ch` is a line break: the bloom mask rejects most characters
   cheaply before the exact Py_UNICODE_ISLINEBREAK() test.  `ch` is
   evaluated more than once. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Non-zero if `chr` occurs in `set` (of `setlen` entries): the bloom mask
   is consulted first and the exact scan only runs when the mask allows it.
   The expansion is fully parenthesized so it can be combined with `!`,
   `||` and friends; `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
229/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000230 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
308 unicode->ob_refcnt = 3;
309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000341 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000355 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
380/* Internal API for use in unicodeobject.c only ! */
381#define _PyUnicode_Resize(unicodevar, length) \
382 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
430 some optimizations which share commonly used objects. */
431 if (u != NULL) {
432
433 /* Optimization for empty strings */
434 if (size == 0 && unicode_empty != NULL) {
435 Py_INCREF(unicode_empty);
436 return (PyObject *)unicode_empty;
437 }
438
Walter Dörwald071b9da2007-05-05 14:21:20 +0000439 /* Single characters are shared when using this constructor */
440 if (size == 1) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000441 unicode = unicode_latin1[(int)*u];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000442 if (!unicode) {
443 unicode = _PyUnicode_New(1);
444 if (!unicode)
445 return NULL;
446 unicode->str[0] = *u;
Walter Dörwaldce32db32007-05-05 14:26:59 +0000447 unicode_latin1[(int)*u] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000448 }
449 Py_INCREF(unicode);
450 return (PyObject *)unicode;
451 }
452 }
453
Walter Dörwald55507312007-05-18 13:12:10 +0000454 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000455 if (!unicode)
456 return NULL;
457
458 /* Copy the Unicode data into the new object */
459 if (u != NULL) {
Walter Dörwaldce32db32007-05-05 14:26:59 +0000460 Py_UNICODE *p = unicode->str;
Martin v. Löwis5b222132007-06-10 09:51:05 +0000461 while (size--)
462 *p++ = *u++;
463 /* Don't need to write trailing 0 because
464 that's already done by _PyUnicode_New */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000465 }
466
467 return (PyObject *)unicode;
468}
469
Walter Dörwaldd2034312007-05-18 16:29:38 +0000470PyObject *PyUnicode_FromString(const char *u)
471{
472 size_t size = strlen(u);
473 if (size > PY_SSIZE_T_MAX) {
474 PyErr_SetString(PyExc_OverflowError, "input too long");
475 return NULL;
476 }
477
478 return PyUnicode_FromStringAndSize(u, size);
479}
480
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481#ifdef HAVE_WCHAR_H
482
483PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000484 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485{
486 PyUnicodeObject *unicode;
487
488 if (w == NULL) {
489 PyErr_BadInternalCall();
490 return NULL;
491 }
492
493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the wchar_t data into the new object */
498#ifdef HAVE_USABLE_WCHAR_T
499 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000500#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000501 {
502 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000503 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000505 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 *u++ = *w++;
507 }
508#endif
509
510 return (PyObject *)unicode;
511}
512
Walter Dörwald346737f2007-05-31 10:44:43 +0000513static void
514makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
515{
516 *fmt++ = '%';
517 if (width) {
518 if (zeropad)
519 *fmt++ = '0';
520 fmt += sprintf(fmt, "%d", width);
521 }
522 if (precision)
523 fmt += sprintf(fmt, ".%d", precision);
524 if (longflag)
525 *fmt++ = 'l';
526 else if (size_tflag) {
527 char *f = PY_FORMAT_SIZE_T;
528 while (*f)
529 *fmt++ = *f++;
530 }
531 *fmt++ = c;
532 *fmt = '\0';
533}
534
Walter Dörwaldd2034312007-05-18 16:29:38 +0000535#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
536
537PyObject *
538PyUnicode_FromFormatV(const char *format, va_list vargs)
539{
540 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000541 Py_ssize_t callcount = 0;
542 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000543 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000545 int width = 0;
546 int precision = 0;
547 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000548 const char* f;
549 Py_UNICODE *s;
550 PyObject *string;
551 /* used by sprintf */
552 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000553 /* use abuffer instead of buffer, if we need more space
554 * (which can happen if there's a format specifier with width). */
555 char *abuffer = NULL;
556 char *realbuffer;
557 Py_ssize_t abuffersize = 0;
558 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000559 const char *copy;
560
561#ifdef VA_LIST_IS_ARRAY
562 Py_MEMCPY(count, vargs, sizeof(va_list));
563#else
564#ifdef __va_copy
565 __va_copy(count, vargs);
566#else
567 count = vargs;
568#endif
569#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 /* step 1: count the number of %S/%R format specifications
571 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
572 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000573 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000574 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 ++callcount;
576 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000577 /* step 2: allocate memory for the results of
578 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000579 if (callcount) {
580 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
581 if (!callresults) {
582 PyErr_NoMemory();
583 return NULL;
584 }
585 callresult = callresults;
586 }
587 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000588 for (f = format; *f; f++) {
589 if (*f == '%') {
590 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000591 width = 0;
592 while (isdigit(Py_CHARMASK(*f)))
593 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000594 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
595 ;
596
597 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
598 * they don't affect the amount of space we reserve.
599 */
600 if ((*f == 'l' || *f == 'z') &&
601 (f[1] == 'd' || f[1] == 'u'))
602 ++f;
603
604 switch (*f) {
605 case 'c':
606 (void)va_arg(count, int);
607 /* fall through... */
608 case '%':
609 n++;
610 break;
611 case 'd': case 'u': case 'i': case 'x':
612 (void) va_arg(count, int);
613 /* 20 bytes is enough to hold a 64-bit
614 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000615 This isn't enough for octal.
616 If a width is specified we need more
617 (which we allocate later). */
618 if (width < 20)
619 width = 20;
620 n += width;
621 if (abuffersize < width)
622 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000623 break;
624 case 's':
625 n += strlen(va_arg(count, char*));
626 break;
627 case 'U':
628 {
629 PyObject *obj = va_arg(count, PyObject *);
630 assert(obj && PyUnicode_Check(obj));
631 n += PyUnicode_GET_SIZE(obj);
632 break;
633 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000634 case 'V':
635 {
636 PyObject *obj = va_arg(count, PyObject *);
637 const char *str = va_arg(count, const char *);
638 assert(obj || str);
639 assert(!obj || PyUnicode_Check(obj));
640 if (obj)
641 n += PyUnicode_GET_SIZE(obj);
642 else
643 n += strlen(str);
644 break;
645 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000646 case 'S':
647 {
648 PyObject *obj = va_arg(count, PyObject *);
649 PyObject *str;
650 assert(obj);
651 str = PyObject_Unicode(obj);
652 if (!str)
653 goto fail;
654 n += PyUnicode_GET_SIZE(str);
655 /* Remember the str and switch to the next slot */
656 *callresult++ = str;
657 break;
658 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000659 case 'R':
660 {
661 PyObject *obj = va_arg(count, PyObject *);
662 PyObject *repr;
663 assert(obj);
664 repr = PyObject_Repr(obj);
665 if (!repr)
666 goto fail;
667 n += PyUnicode_GET_SIZE(repr);
668 /* Remember the repr and switch to the next slot */
669 *callresult++ = repr;
670 break;
671 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000672 case 'p':
673 (void) va_arg(count, int);
674 /* maximum 64-bit pointer representation:
675 * 0xffffffffffffffff
676 * so 19 characters is enough.
677 * XXX I count 18 -- what's the extra for?
678 */
679 n += 19;
680 break;
681 default:
682 /* if we stumble upon an unknown
683 formatting code, copy the rest of
684 the format string to the output
685 string. (we cannot just skip the
686 code, since there's no way to know
687 what's in the argument list) */
688 n += strlen(p);
689 goto expand;
690 }
691 } else
692 n++;
693 }
694 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000695 if (abuffersize > 20) {
696 abuffer = PyMem_Malloc(abuffersize);
697 if (!abuffer) {
698 PyErr_NoMemory();
699 goto fail;
700 }
701 realbuffer = abuffer;
702 }
703 else
704 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000705 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000706 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000707 we don't have to resize the string.
708 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000709 string = PyUnicode_FromUnicode(NULL, n);
710 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000711 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712
713 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000714 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000715
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f++;
719 int longflag = 0;
720 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 zeropad = (*f == '0');
722 /* parse the width.precision part */
723 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000725 width = (width*10) + *f++ - '0';
726 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 if (*f == '.') {
728 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000730 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000732 /* handle the long flag, but only for %ld and %lu.
733 others can be added when necessary. */
734 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
735 longflag = 1;
736 ++f;
737 }
738 /* handle the size_t flag. */
739 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
740 size_tflag = 1;
741 ++f;
742 }
743
744 switch (*f) {
745 case 'c':
746 *s++ = va_arg(vargs, int);
747 break;
748 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000753 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 sprintf(realbuffer, fmt, va_arg(vargs, int));
756 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000757 break;
758 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000762 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000763 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000764 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
766 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767 break;
768 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000769 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
770 sprintf(realbuffer, fmt, va_arg(vargs, int));
771 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000772 break;
773 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000774 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
775 sprintf(realbuffer, fmt, va_arg(vargs, int));
776 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000777 break;
778 case 's':
779 p = va_arg(vargs, char*);
780 appendstring(p);
781 break;
782 case 'U':
783 {
784 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000785 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
786 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
787 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000788 break;
789 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000790 case 'V':
791 {
792 PyObject *obj = va_arg(vargs, PyObject *);
793 const char *str = va_arg(vargs, const char *);
794 if (obj) {
795 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
796 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
797 s += size;
798 } else {
799 appendstring(str);
800 }
801 break;
802 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000803 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000804 case 'R':
805 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000806 Py_UNICODE *ucopy;
807 Py_ssize_t usize;
808 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000809 /* unused, since we already have the result */
810 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000811 ucopy = PyUnicode_AS_UNICODE(*callresult);
812 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000813 for (upos = 0; upos<usize;)
814 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000815 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000816 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000817 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000818 ++callresult;
819 break;
820 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000821 case 'p':
822 sprintf(buffer, "%p", va_arg(vargs, void*));
823 /* %p is ill-defined: ensure leading 0x. */
824 if (buffer[1] == 'X')
825 buffer[1] = 'x';
826 else if (buffer[1] != 'x') {
827 memmove(buffer+2, buffer, strlen(buffer)+1);
828 buffer[0] = '0';
829 buffer[1] = 'x';
830 }
831 appendstring(buffer);
832 break;
833 case '%':
834 *s++ = '%';
835 break;
836 default:
837 appendstring(p);
838 goto end;
839 }
840 } else
841 *s++ = *f;
842 }
843
844 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000845 if (callresults)
846 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000847 if (abuffer)
848 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000849 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
850 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 fail:
852 if (callresults) {
853 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000854 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000855 Py_DECREF(*callresult2);
856 ++callresult2;
857 }
858 PyMem_Free(callresults);
859 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000860 if (abuffer)
861 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000862 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863}
864
865#undef appendstring
866
867PyObject *
868PyUnicode_FromFormat(const char *format, ...)
869{
870 PyObject* ret;
871 va_list vargs;
872
873#ifdef HAVE_STDARG_PROTOTYPES
874 va_start(vargs, format);
875#else
876 va_start(vargs);
877#endif
878 ret = PyUnicode_FromFormatV(format, vargs);
879 va_end(vargs);
880 return ret;
881}
882
Martin v. Löwis18e16552006-02-15 17:27:45 +0000883Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
884 wchar_t *w,
885 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000886{
887 if (unicode == NULL) {
888 PyErr_BadInternalCall();
889 return -1;
890 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000891
892 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000894 size = PyUnicode_GET_SIZE(unicode) + 1;
895
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896#ifdef HAVE_USABLE_WCHAR_T
897 memcpy(w, unicode->str, size * sizeof(wchar_t));
898#else
899 {
900 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000901 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000903 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904 *w++ = *u++;
905 }
906#endif
907
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000908 if (size > PyUnicode_GET_SIZE(unicode))
909 return PyUnicode_GET_SIZE(unicode);
910 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 return size;
912}
913
914#endif
915
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916PyObject *PyUnicode_FromOrdinal(int ordinal)
917{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000918 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000920 if (ordinal < 0 || ordinal > 0x10ffff) {
921 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000922 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000923 return NULL;
924 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000925
926#ifndef Py_UNICODE_WIDE
927 if (ordinal > 0xffff) {
928 ordinal -= 0x10000;
929 s[0] = 0xD800 | (ordinal >> 10);
930 s[1] = 0xDC00 | (ordinal & 0x3FF);
931 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000932 }
933#endif
934
Hye-Shik Chang40574832004-04-06 07:24:51 +0000935 s[0] = (Py_UNICODE)ordinal;
936 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000937}
938
Guido van Rossumd57fd912000-03-10 22:53:23 +0000939PyObject *PyUnicode_FromObject(register PyObject *obj)
940{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000941 /* XXX Perhaps we should make this API an alias of
942 PyObject_Unicode() instead ?! */
943 if (PyUnicode_CheckExact(obj)) {
944 Py_INCREF(obj);
945 return obj;
946 }
947 if (PyUnicode_Check(obj)) {
948 /* For a Unicode subtype that's not a Unicode object,
949 return a true Unicode object with the same data. */
950 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
951 PyUnicode_GET_SIZE(obj));
952 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000953 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
954}
955
956PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
957 const char *encoding,
958 const char *errors)
959{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000960 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000961 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000962 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000963
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964 if (obj == NULL) {
965 PyErr_BadInternalCall();
966 return NULL;
967 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000968
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000969#if 0
970 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000971 that no encodings is given and then redirect to
972 PyObject_Unicode() which then applies the additional logic for
973 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000974
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000975 NOTE: This API should really only be used for object which
976 represent *encoded* Unicode !
977
978 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000979 if (PyUnicode_Check(obj)) {
980 if (encoding) {
981 PyErr_SetString(PyExc_TypeError,
982 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000983 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000984 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000985 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000986 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000987#else
988 if (PyUnicode_Check(obj)) {
989 PyErr_SetString(PyExc_TypeError,
990 "decoding Unicode is not supported");
991 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000992 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000993#endif
994
995 /* Coerce object */
996 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000997 s = PyString_AS_STRING(obj);
998 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000999 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001000 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1001 /* Overwrite the error message with something more useful in
1002 case of a TypeError. */
1003 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001004 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001005 "coercing to Unicode: need string or buffer, "
1006 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001007 obj->ob_type->tp_name);
1008 goto onError;
1009 }
Tim Petersced69f82003-09-16 20:30:58 +00001010
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001011 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012 if (len == 0) {
1013 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001014 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 }
Tim Petersced69f82003-09-16 20:30:58 +00001016 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001017 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001018
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001019 return v;
1020
1021 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023}
1024
1025PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001026 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001027 const char *encoding,
1028 const char *errors)
1029{
1030 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001031
1032 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001033 encoding = PyUnicode_GetDefaultEncoding();
1034
1035 /* Shortcuts for common default encodings */
1036 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001038 else if (strcmp(encoding, "latin-1") == 0)
1039 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001040#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1041 else if (strcmp(encoding, "mbcs") == 0)
1042 return PyUnicode_DecodeMBCS(s, size, errors);
1043#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001044 else if (strcmp(encoding, "ascii") == 0)
1045 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 /* Decode via the codec registry */
1048 buffer = PyBuffer_FromMemory((void *)s, size);
1049 if (buffer == NULL)
1050 goto onError;
1051 unicode = PyCodec_Decode(buffer, encoding, errors);
1052 if (unicode == NULL)
1053 goto onError;
1054 if (!PyUnicode_Check(unicode)) {
1055 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001056 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 unicode->ob_type->tp_name);
1058 Py_DECREF(unicode);
1059 goto onError;
1060 }
1061 Py_DECREF(buffer);
1062 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001063
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 onError:
1065 Py_XDECREF(buffer);
1066 return NULL;
1067}
1068
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001069PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1070 const char *encoding,
1071 const char *errors)
1072{
1073 PyObject *v;
1074
1075 if (!PyUnicode_Check(unicode)) {
1076 PyErr_BadArgument();
1077 goto onError;
1078 }
1079
1080 if (encoding == NULL)
1081 encoding = PyUnicode_GetDefaultEncoding();
1082
1083 /* Decode via the codec registry */
1084 v = PyCodec_Decode(unicode, encoding, errors);
1085 if (v == NULL)
1086 goto onError;
1087 return v;
1088
1089 onError:
1090 return NULL;
1091}
1092
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001094 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 const char *encoding,
1096 const char *errors)
1097{
1098 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001099
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 unicode = PyUnicode_FromUnicode(s, size);
1101 if (unicode == NULL)
1102 return NULL;
1103 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1104 Py_DECREF(unicode);
1105 return v;
1106}
1107
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001108PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1109 const char *encoding,
1110 const char *errors)
1111{
1112 PyObject *v;
1113
1114 if (!PyUnicode_Check(unicode)) {
1115 PyErr_BadArgument();
1116 goto onError;
1117 }
1118
1119 if (encoding == NULL)
1120 encoding = PyUnicode_GetDefaultEncoding();
1121
1122 /* Encode via the codec registry */
1123 v = PyCodec_Encode(unicode, encoding, errors);
1124 if (v == NULL)
1125 goto onError;
1126 return v;
1127
1128 onError:
1129 return NULL;
1130}
1131
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1133 const char *encoding,
1134 const char *errors)
1135{
1136 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001137
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 if (!PyUnicode_Check(unicode)) {
1139 PyErr_BadArgument();
1140 goto onError;
1141 }
Fred Drakee4315f52000-05-09 19:53:39 +00001142
Tim Petersced69f82003-09-16 20:30:58 +00001143 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001144 encoding = PyUnicode_GetDefaultEncoding();
1145
1146 /* Shortcuts for common default encodings */
1147 if (errors == NULL) {
1148 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001149 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001150 else if (strcmp(encoding, "latin-1") == 0)
1151 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001152#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1153 else if (strcmp(encoding, "mbcs") == 0)
1154 return PyUnicode_AsMBCSString(unicode);
1155#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001156 else if (strcmp(encoding, "ascii") == 0)
1157 return PyUnicode_AsASCIIString(unicode);
1158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159
1160 /* Encode via the codec registry */
1161 v = PyCodec_Encode(unicode, encoding, errors);
1162 if (v == NULL)
1163 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001164 if (!PyBytes_Check(v)) {
1165 if (PyString_Check(v)) {
1166 /* Old codec, turn it into bytes */
1167 PyObject *b = PyBytes_FromObject(v);
1168 Py_DECREF(v);
1169 return b;
1170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001172 "encoder did not return a bytes object "
1173 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1174 v->ob_type->tp_name,
1175 encoding ? encoding : "NULL",
1176 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 Py_DECREF(v);
1178 goto onError;
1179 }
1180 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001181
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 onError:
1183 return NULL;
1184}
1185
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001186PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1187 const char *errors)
1188{
1189 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001190 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001191 if (v)
1192 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001193 if (errors != NULL)
1194 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1195 if (errors == NULL) {
1196 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1197 PyUnicode_GET_SIZE(unicode),
1198 NULL);
1199 }
1200 else {
1201 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1202 }
1203 if (!b)
1204 return NULL;
1205 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1206 PyBytes_Size(b));
1207 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001208 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001209 return v;
1210}
1211
Martin v. Löwis5b222132007-06-10 09:51:05 +00001212char*
1213PyUnicode_AsString(PyObject *unicode)
1214{
1215 assert(PyUnicode_Check(unicode));
1216 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1217 if (!unicode)
1218 return NULL;
1219 return PyString_AsString(unicode);
1220}
1221
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1223{
1224 if (!PyUnicode_Check(unicode)) {
1225 PyErr_BadArgument();
1226 goto onError;
1227 }
1228 return PyUnicode_AS_UNICODE(unicode);
1229
1230 onError:
1231 return NULL;
1232}
1233
Martin v. Löwis18e16552006-02-15 17:27:45 +00001234Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235{
1236 if (!PyUnicode_Check(unicode)) {
1237 PyErr_BadArgument();
1238 goto onError;
1239 }
1240 return PyUnicode_GET_SIZE(unicode);
1241
1242 onError:
1243 return -1;
1244}
1245
Thomas Wouters78890102000-07-22 19:25:51 +00001246const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001247{
1248 return unicode_default_encoding;
1249}
1250
1251int PyUnicode_SetDefaultEncoding(const char *encoding)
1252{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001253 if (strcmp(encoding, unicode_default_encoding) != 0) {
1254 PyErr_Format(PyExc_ValueError,
1255 "Can only set default encoding to %s",
1256 unicode_default_encoding);
1257 return -1;
1258 }
Fred Drakee4315f52000-05-09 19:53:39 +00001259 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001260}
1261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001262/* error handling callback helper:
1263 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001264 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001265 and adjust various state variables.
1266 return 0 on success, -1 on error
1267*/
1268
1269static
1270int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1271 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001272 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1273 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001275 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001276
1277 PyObject *restuple = NULL;
1278 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001279 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1280 Py_ssize_t requiredsize;
1281 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001282 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001283 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001284 int res = -1;
1285
1286 if (*errorHandler == NULL) {
1287 *errorHandler = PyCodec_LookupError(errors);
1288 if (*errorHandler == NULL)
1289 goto onError;
1290 }
1291
1292 if (*exceptionObject == NULL) {
1293 *exceptionObject = PyUnicodeDecodeError_Create(
1294 encoding, input, insize, *startinpos, *endinpos, reason);
1295 if (*exceptionObject == NULL)
1296 goto onError;
1297 }
1298 else {
1299 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1300 goto onError;
1301 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1302 goto onError;
1303 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1304 goto onError;
1305 }
1306
1307 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1308 if (restuple == NULL)
1309 goto onError;
1310 if (!PyTuple_Check(restuple)) {
1311 PyErr_Format(PyExc_TypeError, &argparse[4]);
1312 goto onError;
1313 }
1314 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1315 goto onError;
1316 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001317 newpos = insize+newpos;
1318 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001319 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001320 goto onError;
1321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001322
1323 /* need more space? (at least enough for what we
1324 have+the replacement+the rest of the string (starting
1325 at the new input position), so we won't have to check space
1326 when there are no errors in the rest of the string) */
1327 repptr = PyUnicode_AS_UNICODE(repunicode);
1328 repsize = PyUnicode_GET_SIZE(repunicode);
1329 requiredsize = *outpos + repsize + insize-newpos;
1330 if (requiredsize > outsize) {
1331 if (requiredsize<2*outsize)
1332 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001333 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001334 goto onError;
1335 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1336 }
1337 *endinpos = newpos;
1338 *inptr = input + newpos;
1339 Py_UNICODE_COPY(*outptr, repptr, repsize);
1340 *outptr += repsize;
1341 *outpos += repsize;
1342 /* we made it! */
1343 res = 0;
1344
1345 onError:
1346 Py_XDECREF(restuple);
1347 return res;
1348}
1349
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001350/* --- UTF-7 Codec -------------------------------------------------------- */
1351
1352/* see RFC2152 for details */
1353
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* SPECIAL(c, encodeO, encodeWS): true when character c must not appear
   directly in UTF-7 text.  Anything outside 1..127 is special; inside
   that range the table decides, with classes 2 (whitespace) and 3
   (Set O) counting as special only when the matching flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.
   Explicit ASCII ranges are used instead of isalnum(): the argument may
   be a Py_UNICODE value outside the unsigned char range (undefined
   behaviour for <ctype.h> functions), and isalnum() is locale dependent,
   so it could accept characters that are not base64 digits. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value 0..63 of base64 digit c (c must satisfy B64CHAR). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, emit
   the top 6 of them as a base64 digit through out and reduce bits by 6.
   All three arguments are modified in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, take the top 16 as one code unit.  Units in 0xDC00..0xDFFF are
   reported through the enclosing function's `errmsg` variable and its
   `utf7Error` label; the unit following such an error is dropped. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001415
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001416PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001417 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001418 const char *errors)
1419{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001421 Py_ssize_t startinpos;
1422 Py_ssize_t endinpos;
1423 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001424 const char *e;
1425 PyUnicodeObject *unicode;
1426 Py_UNICODE *p;
1427 const char *errmsg = "";
1428 int inShift = 0;
1429 unsigned int bitsleft = 0;
1430 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001431 int surrogate = 0;
1432 PyObject *errorHandler = NULL;
1433 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001434
1435 unicode = _PyUnicode_New(size);
1436 if (!unicode)
1437 return NULL;
1438 if (size == 0)
1439 return (PyObject *)unicode;
1440
1441 p = unicode->str;
1442 e = s + size;
1443
1444 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001445 Py_UNICODE ch;
1446 restart:
1447 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001448
1449 if (inShift) {
1450 if ((ch == '-') || !B64CHAR(ch)) {
1451 inShift = 0;
1452 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001453
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001454 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1455 if (bitsleft >= 6) {
1456 /* The shift sequence has a partial character in it. If
1457 bitsleft < 6 then we could just classify it as padding
1458 but that is not the case here */
1459
1460 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001461 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001462 }
1463 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001464 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001465 here so indicate the potential of a misencoded character. */
1466
1467 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1468 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1469 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001470 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001471 }
1472
1473 if (ch == '-') {
1474 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001475 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001476 inShift = 1;
1477 }
1478 } else if (SPECIAL(ch,0,0)) {
1479 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001480 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001481 } else {
1482 *p++ = ch;
1483 }
1484 } else {
1485 charsleft = (charsleft << 6) | UB64(ch);
1486 bitsleft += 6;
1487 s++;
1488 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1489 }
1490 }
1491 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001492 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001493 s++;
1494 if (s < e && *s == '-') {
1495 s++;
1496 *p++ = '+';
1497 } else
1498 {
1499 inShift = 1;
1500 bitsleft = 0;
1501 }
1502 }
1503 else if (SPECIAL(ch,0,0)) {
1504 errmsg = "unexpected special character";
1505 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001506 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001507 }
1508 else {
1509 *p++ = ch;
1510 s++;
1511 }
1512 continue;
1513 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001514 outpos = p-PyUnicode_AS_UNICODE(unicode);
1515 endinpos = s-starts;
1516 if (unicode_decode_call_errorhandler(
1517 errors, &errorHandler,
1518 "utf7", errmsg,
1519 starts, size, &startinpos, &endinpos, &exc, &s,
1520 (PyObject **)&unicode, &outpos, &p))
1521 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001522 }
1523
1524 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 outpos = p-PyUnicode_AS_UNICODE(unicode);
1526 endinpos = size;
1527 if (unicode_decode_call_errorhandler(
1528 errors, &errorHandler,
1529 "utf7", "unterminated shift sequence",
1530 starts, size, &startinpos, &endinpos, &exc, &s,
1531 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001532 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001533 if (s < e)
1534 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001535 }
1536
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001537 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001538 goto onError;
1539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(errorHandler);
1541 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001542 return (PyObject *)unicode;
1543
1544onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 Py_XDECREF(errorHandler);
1546 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547 Py_DECREF(unicode);
1548 return NULL;
1549}
1550
1551
1552PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001553 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001554 int encodeSetO,
1555 int encodeWhiteSpace,
1556 const char *errors)
1557{
1558 PyObject *v;
1559 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001560 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001562 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001563 unsigned int bitsleft = 0;
1564 unsigned long charsleft = 0;
1565 char * out;
1566 char * start;
1567
1568 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001569 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001570
Walter Dörwald51ab4142007-05-05 14:43:36 +00001571 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001572 if (v == NULL)
1573 return NULL;
1574
Walter Dörwald51ab4142007-05-05 14:43:36 +00001575 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576 for (;i < size; ++i) {
1577 Py_UNICODE ch = s[i];
1578
1579 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001580 if (ch == '+') {
1581 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001582 *out++ = '-';
1583 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1584 charsleft = ch;
1585 bitsleft = 16;
1586 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001587 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001588 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001589 } else {
1590 *out++ = (char) ch;
1591 }
1592 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1594 *out++ = B64(charsleft << (6-bitsleft));
1595 charsleft = 0;
1596 bitsleft = 0;
1597 /* Characters not in the BASE64 set implicitly unshift the sequence
1598 so no '-' is required, except if the character is itself a '-' */
1599 if (B64CHAR(ch) || ch == '-') {
1600 *out++ = '-';
1601 }
1602 inShift = 0;
1603 *out++ = (char) ch;
1604 } else {
1605 bitsleft += 16;
1606 charsleft = (charsleft << 16) | ch;
1607 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1608
1609 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001610 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611 or '-' then the shift sequence will be terminated implicitly and we
1612 don't have to insert a '-'. */
1613
1614 if (bitsleft == 0) {
1615 if (i + 1 < size) {
1616 Py_UNICODE ch2 = s[i+1];
1617
1618 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001619
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001620 } else if (B64CHAR(ch2) || ch2 == '-') {
1621 *out++ = '-';
1622 inShift = 0;
1623 } else {
1624 inShift = 0;
1625 }
1626
1627 }
1628 else {
1629 *out++ = '-';
1630 inShift = 0;
1631 }
1632 }
Tim Petersced69f82003-09-16 20:30:58 +00001633 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001634 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001635 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001636 if (bitsleft) {
1637 *out++= B64(charsleft << (6-bitsleft) );
1638 *out++ = '-';
1639 }
1640
Walter Dörwald51ab4142007-05-05 14:43:36 +00001641 if (PyBytes_Resize(v, out - start)) {
1642 Py_DECREF(v);
1643 return NULL;
1644 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001645 return v;
1646}
1647
1648#undef SPECIAL
1649#undef B64
1650#undef B64CHAR
1651#undef UB64
1652#undef ENCODE
1653#undef DECODE
1654
Guido van Rossumd57fd912000-03-10 22:53:23 +00001655/* --- UTF-8 Codec -------------------------------------------------------- */
1656
/* Sequence length announced by every possible first byte of a UTF-8
   sequence.  An entry of 0 marks a byte that is not a legal prefix.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1678
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001680 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 const char *errors)
1682{
Walter Dörwald69652032004-09-07 20:24:22 +00001683 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1684}
1685
1686PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001687 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001688 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001689 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001690{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001691 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001693 Py_ssize_t startinpos;
1694 Py_ssize_t endinpos;
1695 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 const char *e;
1697 PyUnicodeObject *unicode;
1698 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001699 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 PyObject *errorHandler = NULL;
1701 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702
1703 /* Note: size will always be longer than the resulting Unicode
1704 character count */
1705 unicode = _PyUnicode_New(size);
1706 if (!unicode)
1707 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001708 if (size == 0) {
1709 if (consumed)
1710 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713
1714 /* Unpack UTF-8 encoded data */
1715 p = unicode->str;
1716 e = s + size;
1717
1718 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001719 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
1721 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001722 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 s++;
1724 continue;
1725 }
1726
1727 n = utf8_code_length[ch];
1728
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001729 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001730 if (consumed)
1731 break;
1732 else {
1733 errmsg = "unexpected end of data";
1734 startinpos = s-starts;
1735 endinpos = size;
1736 goto utf8Error;
1737 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001738 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739
1740 switch (n) {
1741
1742 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001743 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001744 startinpos = s-starts;
1745 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001746 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747
1748 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001749 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001750 startinpos = s-starts;
1751 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001752 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001755 if ((s[1] & 0xc0) != 0x80) {
1756 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001757 startinpos = s-starts;
1758 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001759 goto utf8Error;
1760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001762 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001763 startinpos = s-starts;
1764 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001765 errmsg = "illegal encoding";
1766 goto utf8Error;
1767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001769 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 break;
1771
1772 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001773 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001774 (s[2] & 0xc0) != 0x80) {
1775 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001776 startinpos = s-starts;
1777 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001778 goto utf8Error;
1779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001781 if (ch < 0x0800) {
1782 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001783 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001784
1785 XXX For wide builds (UCS-4) we should probably try
1786 to recombine the surrogates into a single code
1787 unit.
1788 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001789 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 startinpos = s-starts;
1791 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001792 goto utf8Error;
1793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001795 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001796 break;
1797
1798 case 4:
1799 if ((s[1] & 0xc0) != 0x80 ||
1800 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001801 (s[3] & 0xc0) != 0x80) {
1802 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 startinpos = s-starts;
1804 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001805 goto utf8Error;
1806 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001807 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1808 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1809 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001810 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001811 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001812 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001813 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001814 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001815 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 startinpos = s-starts;
1817 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001818 goto utf8Error;
1819 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001820#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001821 *p++ = (Py_UNICODE)ch;
1822#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001823 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001824
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001825 /* translate from 10000..10FFFF to 0..FFFF */
1826 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001827
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001828 /* high surrogate = top 10 bits added to D800 */
1829 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001830
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001831 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001832 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001833#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 break;
1835
1836 default:
1837 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001838 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001839 startinpos = s-starts;
1840 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001841 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 }
1843 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001845
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001846 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001847 outpos = p-PyUnicode_AS_UNICODE(unicode);
1848 if (unicode_decode_call_errorhandler(
1849 errors, &errorHandler,
1850 "utf8", errmsg,
1851 starts, size, &startinpos, &endinpos, &exc, &s,
1852 (PyObject **)&unicode, &outpos, &p))
1853 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 }
Walter Dörwald69652032004-09-07 20:24:22 +00001855 if (consumed)
1856 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857
1858 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001859 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 goto onError;
1861
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 Py_XDECREF(errorHandler);
1863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 return (PyObject *)unicode;
1865
1866onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 Py_XDECREF(errorHandler);
1868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869 Py_DECREF(unicode);
1870 return NULL;
1871}
1872
Tim Peters602f7402002-04-27 18:03:26 +00001873/* Allocation strategy: if the string is short, convert into a stack buffer
1874 and allocate exactly as much space needed at the end. Else allocate the
1875 maximum possible needed (4 result bytes per Unicode character), and return
1876 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001877*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001878PyObject *
1879PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001880 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001881 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882{
Tim Peters602f7402002-04-27 18:03:26 +00001883#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001884
Martin v. Löwis18e16552006-02-15 17:27:45 +00001885 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001886 PyObject *v; /* result string object */
1887 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001888 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001889 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001890 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001891
Tim Peters602f7402002-04-27 18:03:26 +00001892 assert(s != NULL);
1893 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894
Tim Peters602f7402002-04-27 18:03:26 +00001895 if (size <= MAX_SHORT_UNICHARS) {
1896 /* Write into the stack buffer; nallocated can't overflow.
1897 * At the end, we'll allocate exactly as much heap space as it
1898 * turns out we need.
1899 */
1900 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1901 v = NULL; /* will allocate after we're done */
1902 p = stackbuf;
1903 }
1904 else {
1905 /* Overallocate on the heap, and give the excess back at the end. */
1906 nallocated = size * 4;
1907 if (nallocated / 4 != size) /* overflow! */
1908 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001909 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001910 if (v == NULL)
1911 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001912 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001913 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001914
Tim Peters602f7402002-04-27 18:03:26 +00001915 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001916 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001917
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001918 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001919 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001921
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001923 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001924 *p++ = (char)(0xc0 | (ch >> 6));
1925 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001926 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001927 else {
Tim Peters602f7402002-04-27 18:03:26 +00001928 /* Encode UCS2 Unicode ordinals */
1929 if (ch < 0x10000) {
1930 /* Special case: check for high surrogate */
1931 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1932 Py_UCS4 ch2 = s[i];
1933 /* Check for low surrogate and combine the two to
1934 form a UCS4 value */
1935 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001936 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001937 i++;
1938 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001939 }
Tim Peters602f7402002-04-27 18:03:26 +00001940 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001941 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001942 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001943 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1944 *p++ = (char)(0x80 | (ch & 0x3f));
1945 continue;
1946 }
1947encodeUCS4:
1948 /* Encode UCS4 Unicode ordinals */
1949 *p++ = (char)(0xf0 | (ch >> 18));
1950 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1951 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1952 *p++ = (char)(0x80 | (ch & 0x3f));
1953 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001955
Tim Peters602f7402002-04-27 18:03:26 +00001956 if (v == NULL) {
1957 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001958 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001959 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001960 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001961 }
1962 else {
1963 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001964 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001965 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001966 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001969
Tim Peters602f7402002-04-27 18:03:26 +00001970#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971}
1972
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1974{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 if (!PyUnicode_Check(unicode)) {
1976 PyErr_BadArgument();
1977 return NULL;
1978 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001979 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1980 PyUnicode_GET_SIZE(unicode),
1981 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982}
1983
1984/* --- UTF-16 Codec ------------------------------------------------------- */
1985
Tim Peters772747b2001-08-09 22:21:55 +00001986PyObject *
1987PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001988 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001989 const char *errors,
1990 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991{
Walter Dörwald69652032004-09-07 20:24:22 +00001992 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1993}
1994
1995PyObject *
1996PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001997 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001998 const char *errors,
1999 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002000 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002001{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002002 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002003 Py_ssize_t startinpos;
2004 Py_ssize_t endinpos;
2005 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 PyUnicodeObject *unicode;
2007 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002008 const unsigned char *q, *e;
2009 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002010 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002011 /* Offsets from q for retrieving byte pairs in the right order. */
2012#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2013 int ihi = 1, ilo = 0;
2014#else
2015 int ihi = 0, ilo = 1;
2016#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 PyObject *errorHandler = NULL;
2018 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019
2020 /* Note: size will always be longer than the resulting Unicode
2021 character count */
2022 unicode = _PyUnicode_New(size);
2023 if (!unicode)
2024 return NULL;
2025 if (size == 0)
2026 return (PyObject *)unicode;
2027
2028 /* Unpack UTF-16 encoded data */
2029 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002030 q = (unsigned char *)s;
2031 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032
2033 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002034 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002036 /* Check for BOM marks (U+FEFF) in the input and adjust current
2037 byte order setting accordingly. In native mode, the leading BOM
2038 mark is skipped, in all other modes, it is copied to the output
2039 stream as-is (giving a ZWNBSP character). */
2040 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002041 if (size >= 2) {
2042 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002043#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002044 if (bom == 0xFEFF) {
2045 q += 2;
2046 bo = -1;
2047 }
2048 else if (bom == 0xFFFE) {
2049 q += 2;
2050 bo = 1;
2051 }
Tim Petersced69f82003-09-16 20:30:58 +00002052#else
Walter Dörwald69652032004-09-07 20:24:22 +00002053 if (bom == 0xFEFF) {
2054 q += 2;
2055 bo = 1;
2056 }
2057 else if (bom == 0xFFFE) {
2058 q += 2;
2059 bo = -1;
2060 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002061#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002062 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064
Tim Peters772747b2001-08-09 22:21:55 +00002065 if (bo == -1) {
2066 /* force LE */
2067 ihi = 1;
2068 ilo = 0;
2069 }
2070 else if (bo == 1) {
2071 /* force BE */
2072 ihi = 0;
2073 ilo = 1;
2074 }
2075
2076 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002077 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002078 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002080 if (consumed)
2081 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 errmsg = "truncated data";
2083 startinpos = ((const char *)q)-starts;
2084 endinpos = ((const char *)e)-starts;
2085 goto utf16Error;
2086 /* The remaining input chars are ignored if the callback
2087 chooses to skip the input */
2088 }
2089 ch = (q[ihi] << 8) | q[ilo];
2090
Tim Peters772747b2001-08-09 22:21:55 +00002091 q += 2;
2092
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 if (ch < 0xD800 || ch > 0xDFFF) {
2094 *p++ = ch;
2095 continue;
2096 }
2097
2098 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002099 if (q >= e) {
2100 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002101 startinpos = (((const char *)q)-2)-starts;
2102 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002103 goto utf16Error;
2104 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002105 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002106 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2107 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002108 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002109#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002110 *p++ = ch;
2111 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002112#else
2113 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002114#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002115 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002116 }
2117 else {
2118 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 startinpos = (((const char *)q)-4)-starts;
2120 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002121 goto utf16Error;
2122 }
2123
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002125 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002126 startinpos = (((const char *)q)-2)-starts;
2127 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002128 /* Fall through to report the error */
2129
2130 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 outpos = p-PyUnicode_AS_UNICODE(unicode);
2132 if (unicode_decode_call_errorhandler(
2133 errors, &errorHandler,
2134 "utf16", errmsg,
2135 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2136 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002137 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 }
2139
2140 if (byteorder)
2141 *byteorder = bo;
2142
Walter Dörwald69652032004-09-07 20:24:22 +00002143 if (consumed)
2144 *consumed = (const char *)q-starts;
2145
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002147 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 goto onError;
2149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 Py_XDECREF(errorHandler);
2151 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 return (PyObject *)unicode;
2153
2154onError:
2155 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002156 Py_XDECREF(errorHandler);
2157 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 return NULL;
2159}
2160
Tim Peters772747b2001-08-09 22:21:55 +00002161PyObject *
2162PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002163 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002164 const char *errors,
2165 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166{
2167 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002168 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002169#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002170 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002171#else
2172 const int pairs = 0;
2173#endif
Tim Peters772747b2001-08-09 22:21:55 +00002174 /* Offsets from p for storing byte pairs in the right order. */
2175#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2176 int ihi = 1, ilo = 0;
2177#else
2178 int ihi = 0, ilo = 1;
2179#endif
2180
2181#define STORECHAR(CH) \
2182 do { \
2183 p[ihi] = ((CH) >> 8) & 0xff; \
2184 p[ilo] = (CH) & 0xff; \
2185 p += 2; \
2186 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002187
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002188#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002189 for (i = pairs = 0; i < size; i++)
2190 if (s[i] >= 0x10000)
2191 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002192#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002193 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002194 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 if (v == NULL)
2196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197
Walter Dörwald3cc34522007-05-04 10:48:27 +00002198 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002200 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002201 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002202 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002203
2204 if (byteorder == -1) {
2205 /* force LE */
2206 ihi = 1;
2207 ilo = 0;
2208 }
2209 else if (byteorder == 1) {
2210 /* force BE */
2211 ihi = 0;
2212 ilo = 1;
2213 }
2214
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002215 while (size-- > 0) {
2216 Py_UNICODE ch = *s++;
2217 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002218#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002219 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002220 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2221 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002223#endif
Tim Peters772747b2001-08-09 22:21:55 +00002224 STORECHAR(ch);
2225 if (ch2)
2226 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002229#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230}
2231
2232PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2233{
2234 if (!PyUnicode_Check(unicode)) {
2235 PyErr_BadArgument();
2236 return NULL;
2237 }
2238 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2239 PyUnicode_GET_SIZE(unicode),
2240 NULL,
2241 0);
2242}
2243
2244/* --- Unicode Escape Codec ----------------------------------------------- */
2245
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   below when it needs character-name lookups -- confirm against the code
   that assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00002246static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002247
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002249 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 const char *errors)
2251{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002252 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002253 Py_ssize_t startinpos;
2254 Py_ssize_t endinpos;
2255 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002258 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002260 char* message;
2261 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002262 PyObject *errorHandler = NULL;
2263 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002264
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 /* Escaped strings will always be longer than the resulting
2266 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002267 length after conversion to the true value.
2268 (but if the error callback returns a long replacement string
2269 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 v = _PyUnicode_New(size);
2271 if (v == NULL)
2272 goto onError;
2273 if (size == 0)
2274 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002275
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002278
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 while (s < end) {
2280 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002281 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002282 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283
2284 /* Non-escape characters are interpreted as Unicode ordinals */
2285 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002286 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287 continue;
2288 }
2289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002290 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 /* \ - Escapes */
2292 s++;
2293 switch (*s++) {
2294
2295 /* \x escapes */
2296 case '\n': break;
2297 case '\\': *p++ = '\\'; break;
2298 case '\'': *p++ = '\''; break;
2299 case '\"': *p++ = '\"'; break;
2300 case 'b': *p++ = '\b'; break;
2301 case 'f': *p++ = '\014'; break; /* FF */
2302 case 't': *p++ = '\t'; break;
2303 case 'n': *p++ = '\n'; break;
2304 case 'r': *p++ = '\r'; break;
2305 case 'v': *p++ = '\013'; break; /* VT */
2306 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2307
2308 /* \OOO (octal) escapes */
2309 case '0': case '1': case '2': case '3':
2310 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002311 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002313 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002315 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002317 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002318 break;
2319
Fredrik Lundhccc74732001-02-18 22:13:49 +00002320 /* hex escapes */
2321 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002323 digits = 2;
2324 message = "truncated \\xXX escape";
2325 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002326
Fredrik Lundhccc74732001-02-18 22:13:49 +00002327 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002329 digits = 4;
2330 message = "truncated \\uXXXX escape";
2331 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002332
Fredrik Lundhccc74732001-02-18 22:13:49 +00002333 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002334 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002335 digits = 8;
2336 message = "truncated \\UXXXXXXXX escape";
2337 hexescape:
2338 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002339 outpos = p-PyUnicode_AS_UNICODE(v);
2340 if (s+digits>end) {
2341 endinpos = size;
2342 if (unicode_decode_call_errorhandler(
2343 errors, &errorHandler,
2344 "unicodeescape", "end of string in escape sequence",
2345 starts, size, &startinpos, &endinpos, &exc, &s,
2346 (PyObject **)&v, &outpos, &p))
2347 goto onError;
2348 goto nextByte;
2349 }
2350 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002351 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002352 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002353 endinpos = (s+i+1)-starts;
2354 if (unicode_decode_call_errorhandler(
2355 errors, &errorHandler,
2356 "unicodeescape", message,
2357 starts, size, &startinpos, &endinpos, &exc, &s,
2358 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002359 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002360 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002361 }
2362 chr = (chr<<4) & ~0xF;
2363 if (c >= '0' && c <= '9')
2364 chr += c - '0';
2365 else if (c >= 'a' && c <= 'f')
2366 chr += 10 + c - 'a';
2367 else
2368 chr += 10 + c - 'A';
2369 }
2370 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002371 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002372 /* _decoding_error will have already written into the
2373 target buffer. */
2374 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002375 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002376 /* when we get here, chr is a 32-bit unicode character */
2377 if (chr <= 0xffff)
2378 /* UCS-2 character */
2379 *p++ = (Py_UNICODE) chr;
2380 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002381 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002382 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002383#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002384 *p++ = chr;
2385#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002386 chr -= 0x10000L;
2387 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002388 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002389#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002390 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002391 endinpos = s-starts;
2392 outpos = p-PyUnicode_AS_UNICODE(v);
2393 if (unicode_decode_call_errorhandler(
2394 errors, &errorHandler,
2395 "unicodeescape", "illegal Unicode character",
2396 starts, size, &startinpos, &endinpos, &exc, &s,
2397 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002398 goto onError;
2399 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002400 break;
2401
2402 /* \N{name} */
2403 case 'N':
2404 message = "malformed \\N character escape";
2405 if (ucnhash_CAPI == NULL) {
2406 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002407 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002408 m = PyImport_ImportModule("unicodedata");
2409 if (m == NULL)
2410 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002411 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002412 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002413 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002414 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002415 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002416 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002417 if (ucnhash_CAPI == NULL)
2418 goto ucnhashError;
2419 }
2420 if (*s == '{') {
2421 const char *start = s+1;
2422 /* look for the closing brace */
2423 while (*s != '}' && s < end)
2424 s++;
2425 if (s > start && s < end && *s == '}') {
2426 /* found a name. look it up in the unicode database */
2427 message = "unknown Unicode character name";
2428 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002429 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002430 goto store;
2431 }
2432 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002433 endinpos = s-starts;
2434 outpos = p-PyUnicode_AS_UNICODE(v);
2435 if (unicode_decode_call_errorhandler(
2436 errors, &errorHandler,
2437 "unicodeescape", message,
2438 starts, size, &startinpos, &endinpos, &exc, &s,
2439 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002440 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002441 break;
2442
2443 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002444 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002445 message = "\\ at end of string";
2446 s--;
2447 endinpos = s-starts;
2448 outpos = p-PyUnicode_AS_UNICODE(v);
2449 if (unicode_decode_call_errorhandler(
2450 errors, &errorHandler,
2451 "unicodeescape", message,
2452 starts, size, &startinpos, &endinpos, &exc, &s,
2453 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002454 goto onError;
2455 }
2456 else {
2457 *p++ = '\\';
2458 *p++ = (unsigned char)s[-1];
2459 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002460 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002462 nextByte:
2463 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002465 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002466 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002467 Py_XDECREF(errorHandler);
2468 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002470
Fredrik Lundhccc74732001-02-18 22:13:49 +00002471ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002472 PyErr_SetString(
2473 PyExc_UnicodeError,
2474 "\\N escapes not supported (can't load unicodedata module)"
2475 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002476 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477 Py_XDECREF(errorHandler);
2478 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002479 return NULL;
2480
Fredrik Lundhccc74732001-02-18 22:13:49 +00002481onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002483 Py_XDECREF(errorHandler);
2484 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 return NULL;
2486}
2487
/* Return a Unicode-Escape string version of the Unicode object.

   Note: the encoder below, PyUnicode_EncodeUnicodeEscape(), takes no
   "quotes" argument; its output is not enclosed in u"" or u'' quotes.

*/
2494
Thomas Wouters477c8d52006-05-27 19:21:47 +00002495Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2496 Py_ssize_t size,
2497 Py_UNICODE ch)
2498{
2499 /* like wcschr, but doesn't stop at NULL characters */
2500
2501 while (size-- > 0) {
2502 if (*s == ch)
2503 return s;
2504 s++;
2505 }
2506
2507 return NULL;
2508}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002509
Walter Dörwald79e913e2007-05-12 11:08:06 +00002510static const char *hexdigits = "0123456789abcdef";
2511
2512PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2513 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514{
2515 PyObject *repr;
2516 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517
Thomas Wouters89f507f2006-12-13 04:49:30 +00002518 /* XXX(nnorwitz): rather than over-allocating, it would be
2519 better to choose a different scheme. Perhaps scan the
2520 first N-chars of the string and allocate based on that size.
2521 */
2522 /* Initial allocation is based on the longest-possible unichr
2523 escape.
2524
2525 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2526 unichr, so in this case it's the longest unichr escape. In
2527 narrow (UTF-16) builds this is five chars per source unichr
2528 since there are two unichrs in the surrogate pair, so in narrow
2529 (UTF-16) builds it's not the longest unichr escape.
2530
2531 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2532 so in the narrow (UTF-16) build case it's the longest unichr
2533 escape.
2534 */
2535
Walter Dörwald79e913e2007-05-12 11:08:06 +00002536 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002537#ifdef Py_UNICODE_WIDE
2538 + 10*size
2539#else
2540 + 6*size
2541#endif
2542 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 if (repr == NULL)
2544 return NULL;
2545
Walter Dörwald79e913e2007-05-12 11:08:06 +00002546 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547
Guido van Rossumd57fd912000-03-10 22:53:23 +00002548 while (size-- > 0) {
2549 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002550
Walter Dörwald79e913e2007-05-12 11:08:06 +00002551 /* Escape backslashes */
2552 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 *p++ = '\\';
2554 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002555 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002556 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002557
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002558#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002559 /* Map 21-bit characters to '\U00xxxxxx' */
2560 else if (ch >= 0x10000) {
2561 *p++ = '\\';
2562 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002563 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2564 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2565 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2566 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2567 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2568 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2569 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2570 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002571 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002572 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002573#else
2574 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002575 else if (ch >= 0xD800 && ch < 0xDC00) {
2576 Py_UNICODE ch2;
2577 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002578
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002579 ch2 = *s++;
2580 size--;
2581 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2582 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2583 *p++ = '\\';
2584 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002585 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2586 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2587 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2588 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2589 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2590 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2591 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2592 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002593 continue;
2594 }
2595 /* Fall through: isolated surrogates are copied as-is */
2596 s--;
2597 size++;
2598 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002599#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002600
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002602 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 *p++ = '\\';
2604 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002605 *p++ = hexdigits[(ch >> 12) & 0x000F];
2606 *p++ = hexdigits[(ch >> 8) & 0x000F];
2607 *p++ = hexdigits[(ch >> 4) & 0x000F];
2608 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002610
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002611 /* Map special whitespace to '\t', \n', '\r' */
2612 else if (ch == '\t') {
2613 *p++ = '\\';
2614 *p++ = 't';
2615 }
2616 else if (ch == '\n') {
2617 *p++ = '\\';
2618 *p++ = 'n';
2619 }
2620 else if (ch == '\r') {
2621 *p++ = '\\';
2622 *p++ = 'r';
2623 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002624
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002625 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002626 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002628 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002629 *p++ = hexdigits[(ch >> 4) & 0x000F];
2630 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002631 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002632
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 /* Copy everything else as-is */
2634 else
2635 *p++ = (char) ch;
2636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637
2638 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002639 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2640 Py_DECREF(repr);
2641 return NULL;
2642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 return repr;
2644}
2645
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2647{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002648 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 if (!PyUnicode_Check(unicode)) {
2650 PyErr_BadArgument();
2651 return NULL;
2652 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002653 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2654 PyUnicode_GET_SIZE(unicode));
2655
2656 if (!s)
2657 return NULL;
2658 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2659 PyBytes_GET_SIZE(s));
2660 Py_DECREF(s);
2661 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662}
2663
2664/* --- Raw Unicode Escape Codec ------------------------------------------- */
2665
2666PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002667 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 const char *errors)
2669{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002670 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002671 Py_ssize_t startinpos;
2672 Py_ssize_t endinpos;
2673 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002675 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 const char *end;
2677 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002678 PyObject *errorHandler = NULL;
2679 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002680
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 /* Escaped strings will always be longer than the resulting
2682 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 length after conversion to the true value. (But decoding error
2684 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 v = _PyUnicode_New(size);
2686 if (v == NULL)
2687 goto onError;
2688 if (size == 0)
2689 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002690 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 end = s + size;
2692 while (s < end) {
2693 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002694 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002696 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697
2698 /* Non-escape characters are interpreted as Unicode ordinals */
2699 if (*s != '\\') {
2700 *p++ = (unsigned char)*s++;
2701 continue;
2702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002703 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704
2705 /* \u-escapes are only interpreted iff the number of leading
2706 backslashes if odd */
2707 bs = s;
2708 for (;s < end;) {
2709 if (*s != '\\')
2710 break;
2711 *p++ = (unsigned char)*s++;
2712 }
2713 if (((s - bs) & 1) == 0 ||
2714 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002715 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 continue;
2717 }
2718 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002719 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 s++;
2721
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002722 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002724 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002725 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 endinpos = s-starts;
2728 if (unicode_decode_call_errorhandler(
2729 errors, &errorHandler,
2730 "rawunicodeescape", "truncated \\uXXXX",
2731 starts, size, &startinpos, &endinpos, &exc, &s,
2732 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 }
2736 x = (x<<4) & ~0xF;
2737 if (c >= '0' && c <= '9')
2738 x += c - '0';
2739 else if (c >= 'a' && c <= 'f')
2740 x += 10 + c - 'a';
2741 else
2742 x += 10 + c - 'A';
2743 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002744#ifndef Py_UNICODE_WIDE
2745 if (x > 0x10000) {
2746 if (unicode_decode_call_errorhandler(
2747 errors, &errorHandler,
2748 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2749 starts, size, &startinpos, &endinpos, &exc, &s,
2750 (PyObject **)&v, &outpos, &p))
2751 goto onError;
2752 }
2753#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002754 *p++ = x;
2755 nextByte:
2756 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002758 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002759 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 Py_XDECREF(errorHandler);
2761 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002763
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 onError:
2765 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 Py_XDECREF(errorHandler);
2767 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 return NULL;
2769}
2770
2771PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002772 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773{
2774 PyObject *repr;
2775 char *p;
2776 char *q;
2777
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002778#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002779 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002780#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002781 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002782#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 if (repr == NULL)
2784 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002785 if (size == 0)
2786 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787
Walter Dörwald711005d2007-05-12 12:03:26 +00002788 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 while (size-- > 0) {
2790 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002791#ifdef Py_UNICODE_WIDE
2792 /* Map 32-bit characters to '\Uxxxxxxxx' */
2793 if (ch >= 0x10000) {
2794 *p++ = '\\';
2795 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002796 *p++ = hexdigits[(ch >> 28) & 0xf];
2797 *p++ = hexdigits[(ch >> 24) & 0xf];
2798 *p++ = hexdigits[(ch >> 20) & 0xf];
2799 *p++ = hexdigits[(ch >> 16) & 0xf];
2800 *p++ = hexdigits[(ch >> 12) & 0xf];
2801 *p++ = hexdigits[(ch >> 8) & 0xf];
2802 *p++ = hexdigits[(ch >> 4) & 0xf];
2803 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002804 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002805 else
2806#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 /* Map 16-bit characters to '\uxxxx' */
2808 if (ch >= 256) {
2809 *p++ = '\\';
2810 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002811 *p++ = hexdigits[(ch >> 12) & 0xf];
2812 *p++ = hexdigits[(ch >> 8) & 0xf];
2813 *p++ = hexdigits[(ch >> 4) & 0xf];
2814 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 }
2816 /* Copy everything else as-is */
2817 else
2818 *p++ = (char) ch;
2819 }
2820 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002821 if (PyBytes_Resize(repr, p - q)) {
2822 Py_DECREF(repr);
2823 return NULL;
2824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 return repr;
2826}
2827
2828PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2829{
Walter Dörwald711005d2007-05-12 12:03:26 +00002830 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002832 PyErr_BadArgument();
2833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002835 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2836 PyUnicode_GET_SIZE(unicode));
2837
2838 if (!s)
2839 return NULL;
2840 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2841 PyBytes_GET_SIZE(s));
2842 Py_DECREF(s);
2843 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844}
2845
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002846/* --- Unicode Internal Codec ------------------------------------------- */
2847
2848PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002849 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002850 const char *errors)
2851{
2852 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t startinpos;
2854 Py_ssize_t endinpos;
2855 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002856 PyUnicodeObject *v;
2857 Py_UNICODE *p;
2858 const char *end;
2859 const char *reason;
2860 PyObject *errorHandler = NULL;
2861 PyObject *exc = NULL;
2862
Neal Norwitzd43069c2006-01-08 01:12:10 +00002863#ifdef Py_UNICODE_WIDE
2864 Py_UNICODE unimax = PyUnicode_GetMax();
2865#endif
2866
Thomas Wouters89f507f2006-12-13 04:49:30 +00002867 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002868 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2869 if (v == NULL)
2870 goto onError;
2871 if (PyUnicode_GetSize((PyObject *)v) == 0)
2872 return (PyObject *)v;
2873 p = PyUnicode_AS_UNICODE(v);
2874 end = s + size;
2875
2876 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002877 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002878 /* We have to sanity check the raw data, otherwise doom looms for
2879 some malformed UCS-4 data. */
2880 if (
2881 #ifdef Py_UNICODE_WIDE
2882 *p > unimax || *p < 0 ||
2883 #endif
2884 end-s < Py_UNICODE_SIZE
2885 )
2886 {
2887 startinpos = s - starts;
2888 if (end-s < Py_UNICODE_SIZE) {
2889 endinpos = end-starts;
2890 reason = "truncated input";
2891 }
2892 else {
2893 endinpos = s - starts + Py_UNICODE_SIZE;
2894 reason = "illegal code point (> 0x10FFFF)";
2895 }
2896 outpos = p - PyUnicode_AS_UNICODE(v);
2897 if (unicode_decode_call_errorhandler(
2898 errors, &errorHandler,
2899 "unicode_internal", reason,
2900 starts, size, &startinpos, &endinpos, &exc, &s,
2901 (PyObject **)&v, &outpos, &p)) {
2902 goto onError;
2903 }
2904 }
2905 else {
2906 p++;
2907 s += Py_UNICODE_SIZE;
2908 }
2909 }
2910
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002911 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002912 goto onError;
2913 Py_XDECREF(errorHandler);
2914 Py_XDECREF(exc);
2915 return (PyObject *)v;
2916
2917 onError:
2918 Py_XDECREF(v);
2919 Py_XDECREF(errorHandler);
2920 Py_XDECREF(exc);
2921 return NULL;
2922}
2923
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924/* --- Latin-1 Codec ------------------------------------------------------ */
2925
2926PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002927 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928 const char *errors)
2929{
2930 PyUnicodeObject *v;
2931 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002932
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002934 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002935 Py_UNICODE r = *(unsigned char*)s;
2936 return PyUnicode_FromUnicode(&r, 1);
2937 }
2938
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 v = _PyUnicode_New(size);
2940 if (v == NULL)
2941 goto onError;
2942 if (size == 0)
2943 return (PyObject *)v;
2944 p = PyUnicode_AS_UNICODE(v);
2945 while (size-- > 0)
2946 *p++ = (unsigned char)*s++;
2947 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002948
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 onError:
2950 Py_XDECREF(v);
2951 return NULL;
2952}
2953
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002954/* create or adjust a UnicodeEncodeError */
2955static void make_encode_exception(PyObject **exceptionObject,
2956 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002957 const Py_UNICODE *unicode, Py_ssize_t size,
2958 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002960{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 if (*exceptionObject == NULL) {
2962 *exceptionObject = PyUnicodeEncodeError_Create(
2963 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 }
2965 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2967 goto onError;
2968 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2969 goto onError;
2970 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2971 goto onError;
2972 return;
2973 onError:
2974 Py_DECREF(*exceptionObject);
2975 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 }
2977}
2978
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979/* raises a UnicodeEncodeError */
2980static void raise_encode_exception(PyObject **exceptionObject,
2981 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002982 const Py_UNICODE *unicode, Py_ssize_t size,
2983 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 const char *reason)
2985{
2986 make_encode_exception(exceptionObject,
2987 encoding, unicode, size, startpos, endpos, reason);
2988 if (*exceptionObject != NULL)
2989 PyCodec_StrictErrors(*exceptionObject);
2990}
2991
2992/* error handling callback helper:
2993 build arguments, call the callback and check the arguments,
2994 put the result into newpos and return the replacement string, which
2995 has to be freed by the caller */
2996static PyObject *unicode_encode_call_errorhandler(const char *errors,
2997 PyObject **errorHandler,
2998 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002999 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3000 Py_ssize_t startpos, Py_ssize_t endpos,
3001 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003002{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003003 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004
3005 PyObject *restuple;
3006 PyObject *resunicode;
3007
3008 if (*errorHandler == NULL) {
3009 *errorHandler = PyCodec_LookupError(errors);
3010 if (*errorHandler == NULL)
3011 return NULL;
3012 }
3013
3014 make_encode_exception(exceptionObject,
3015 encoding, unicode, size, startpos, endpos, reason);
3016 if (*exceptionObject == NULL)
3017 return NULL;
3018
3019 restuple = PyObject_CallFunctionObjArgs(
3020 *errorHandler, *exceptionObject, NULL);
3021 if (restuple == NULL)
3022 return NULL;
3023 if (!PyTuple_Check(restuple)) {
3024 PyErr_Format(PyExc_TypeError, &argparse[4]);
3025 Py_DECREF(restuple);
3026 return NULL;
3027 }
3028 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3029 &resunicode, newpos)) {
3030 Py_DECREF(restuple);
3031 return NULL;
3032 }
3033 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003034 *newpos = size+*newpos;
3035 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003036 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003037 Py_DECREF(restuple);
3038 return NULL;
3039 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040 Py_INCREF(resunicode);
3041 Py_DECREF(restuple);
3042 return resunicode;
3043}
3044
3045static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003046 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047 const char *errors,
3048 int limit)
3049{
3050 /* output object */
3051 PyObject *res;
3052 /* pointers to the beginning and end+1 of input */
3053 const Py_UNICODE *startp = p;
3054 const Py_UNICODE *endp = p + size;
3055 /* pointer to the beginning of the unencodable characters */
3056 /* const Py_UNICODE *badp = NULL; */
3057 /* pointer into the output */
3058 char *str;
3059 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003060 Py_ssize_t respos = 0;
3061 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003062 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3063 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 PyObject *errorHandler = NULL;
3065 PyObject *exc = NULL;
3066 /* the following variable is used for caching string comparisons
3067 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3068 int known_errorHandler = -1;
3069
3070 /* allocate enough for a simple encoding without
3071 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003072 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003073 if (res == NULL)
3074 goto onError;
3075 if (size == 0)
3076 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003077 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 ressize = size;
3079
3080 while (p<endp) {
3081 Py_UNICODE c = *p;
3082
3083 /* can we encode this? */
3084 if (c<limit) {
3085 /* no overflow check, because we know that the space is enough */
3086 *str++ = (char)c;
3087 ++p;
3088 }
3089 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003090 Py_ssize_t unicodepos = p-startp;
3091 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003093 Py_ssize_t repsize;
3094 Py_ssize_t newpos;
3095 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003096 Py_UNICODE *uni2;
3097 /* startpos for collecting unencodable chars */
3098 const Py_UNICODE *collstart = p;
3099 const Py_UNICODE *collend = p;
3100 /* find all unecodable characters */
3101 while ((collend < endp) && ((*collend)>=limit))
3102 ++collend;
3103 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3104 if (known_errorHandler==-1) {
3105 if ((errors==NULL) || (!strcmp(errors, "strict")))
3106 known_errorHandler = 1;
3107 else if (!strcmp(errors, "replace"))
3108 known_errorHandler = 2;
3109 else if (!strcmp(errors, "ignore"))
3110 known_errorHandler = 3;
3111 else if (!strcmp(errors, "xmlcharrefreplace"))
3112 known_errorHandler = 4;
3113 else
3114 known_errorHandler = 0;
3115 }
3116 switch (known_errorHandler) {
3117 case 1: /* strict */
3118 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3119 goto onError;
3120 case 2: /* replace */
3121 while (collstart++<collend)
3122 *str++ = '?'; /* fall through */
3123 case 3: /* ignore */
3124 p = collend;
3125 break;
3126 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003127 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003128 /* determine replacement size (temporarily (mis)uses p) */
3129 for (p = collstart, repsize = 0; p < collend; ++p) {
3130 if (*p<10)
3131 repsize += 2+1+1;
3132 else if (*p<100)
3133 repsize += 2+2+1;
3134 else if (*p<1000)
3135 repsize += 2+3+1;
3136 else if (*p<10000)
3137 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003138#ifndef Py_UNICODE_WIDE
3139 else
3140 repsize += 2+5+1;
3141#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 else if (*p<100000)
3143 repsize += 2+5+1;
3144 else if (*p<1000000)
3145 repsize += 2+6+1;
3146 else
3147 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003148#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003149 }
3150 requiredsize = respos+repsize+(endp-collend);
3151 if (requiredsize > ressize) {
3152 if (requiredsize<2*ressize)
3153 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003154 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003155 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003156 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003157 ressize = requiredsize;
3158 }
3159 /* generate replacement (temporarily (mis)uses p) */
3160 for (p = collstart; p < collend; ++p) {
3161 str += sprintf(str, "&#%d;", (int)*p);
3162 }
3163 p = collend;
3164 break;
3165 default:
3166 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3167 encoding, reason, startp, size, &exc,
3168 collstart-startp, collend-startp, &newpos);
3169 if (repunicode == NULL)
3170 goto onError;
3171 /* need more space? (at least enough for what we
3172 have+the replacement+the rest of the string, so
3173 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003174 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003175 repsize = PyUnicode_GET_SIZE(repunicode);
3176 requiredsize = respos+repsize+(endp-collend);
3177 if (requiredsize > ressize) {
3178 if (requiredsize<2*ressize)
3179 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003180 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003181 Py_DECREF(repunicode);
3182 goto onError;
3183 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003184 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 ressize = requiredsize;
3186 }
3187 /* check if there is anything unencodable in the replacement
3188 and copy it to the output */
3189 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3190 c = *uni2;
3191 if (c >= limit) {
3192 raise_encode_exception(&exc, encoding, startp, size,
3193 unicodepos, unicodepos+1, reason);
3194 Py_DECREF(repunicode);
3195 goto onError;
3196 }
3197 *str = (char)c;
3198 }
3199 p = startp + newpos;
3200 Py_DECREF(repunicode);
3201 }
3202 }
3203 }
3204 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003205 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003206 if (respos<ressize)
3207 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003208 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003209 Py_XDECREF(errorHandler);
3210 Py_XDECREF(exc);
3211 return res;
3212
3213 onError:
3214 Py_XDECREF(res);
3215 Py_XDECREF(errorHandler);
3216 Py_XDECREF(exc);
3217 return NULL;
3218}
3219
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003221 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 const char *errors)
3223{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225}
3226
3227PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3228{
3229 if (!PyUnicode_Check(unicode)) {
3230 PyErr_BadArgument();
3231 return NULL;
3232 }
3233 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3234 PyUnicode_GET_SIZE(unicode),
3235 NULL);
3236}
3237
3238/* --- 7-bit ASCII Codec -------------------------------------------------- */
3239
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003241 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 const char *errors)
3243{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003244 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 PyUnicodeObject *v;
3246 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003247 Py_ssize_t startinpos;
3248 Py_ssize_t endinpos;
3249 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003250 const char *e;
3251 PyObject *errorHandler = NULL;
3252 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003253
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003255 if (size == 1 && *(unsigned char*)s < 128) {
3256 Py_UNICODE r = *(unsigned char*)s;
3257 return PyUnicode_FromUnicode(&r, 1);
3258 }
Tim Petersced69f82003-09-16 20:30:58 +00003259
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 v = _PyUnicode_New(size);
3261 if (v == NULL)
3262 goto onError;
3263 if (size == 0)
3264 return (PyObject *)v;
3265 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 e = s + size;
3267 while (s < e) {
3268 register unsigned char c = (unsigned char)*s;
3269 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 ++s;
3272 }
3273 else {
3274 startinpos = s-starts;
3275 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003276 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277 if (unicode_decode_call_errorhandler(
3278 errors, &errorHandler,
3279 "ascii", "ordinal not in range(128)",
3280 starts, size, &startinpos, &endinpos, &exc, &s,
3281 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003285 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003286 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003287 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 Py_XDECREF(errorHandler);
3289 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003291
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 onError:
3293 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003294 Py_XDECREF(errorHandler);
3295 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 return NULL;
3297}
3298
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003300 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301 const char *errors)
3302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304}
3305
3306PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3307{
3308 if (!PyUnicode_Check(unicode)) {
3309 PyErr_BadArgument();
3310 return NULL;
3311 }
3312 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3313 PyUnicode_GET_SIZE(unicode),
3314 NULL);
3315}
3316
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003317#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003318
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003319/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003320
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003321#if SIZEOF_INT < SIZEOF_SSIZE_T
3322#define NEED_RETRY
3323#endif
3324
3325/* XXX This code is limited to "true" double-byte encodings, as
3326 a) it assumes an incomplete character consists of a single byte, and
3327 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3328 encodings, see IsDBCSLeadByteEx documentation. */
3329
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte in its
   context, 0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts
   when the character preceding it (found with CharPrev) does not already
   claim it as a trail byte. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3340
3341/*
3342 * Decode MBCS string into unicode object. If 'final' is set, converts
3343 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3344 */
3345static int decode_mbcs(PyUnicodeObject **v,
3346 const char *s, /* MBCS string */
3347 int size, /* sizeof MBCS string */
3348 int final)
3349{
3350 Py_UNICODE *p;
3351 Py_ssize_t n = 0;
3352 int usize = 0;
3353
3354 assert(size >= 0);
3355
3356 /* Skip trailing lead-byte unless 'final' is set */
3357 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3358 --size;
3359
3360 /* First get the size of the result */
3361 if (size > 0) {
3362 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3363 if (usize == 0) {
3364 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3365 return -1;
3366 }
3367 }
3368
3369 if (*v == NULL) {
3370 /* Create unicode object */
3371 *v = _PyUnicode_New(usize);
3372 if (*v == NULL)
3373 return -1;
3374 }
3375 else {
3376 /* Extend unicode object */
3377 n = PyUnicode_GET_SIZE(*v);
3378 if (_PyUnicode_Resize(v, n + usize) < 0)
3379 return -1;
3380 }
3381
3382 /* Do the conversion */
3383 if (size > 0) {
3384 p = PyUnicode_AS_UNICODE(*v) + n;
3385 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3386 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3387 return -1;
3388 }
3389 }
3390
3391 return size;
3392}
3393
3394PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3395 Py_ssize_t size,
3396 const char *errors,
3397 Py_ssize_t *consumed)
3398{
3399 PyUnicodeObject *v = NULL;
3400 int done;
3401
3402 if (consumed)
3403 *consumed = 0;
3404
3405#ifdef NEED_RETRY
3406 retry:
3407 if (size > INT_MAX)
3408 done = decode_mbcs(&v, s, INT_MAX, 0);
3409 else
3410#endif
3411 done = decode_mbcs(&v, s, (int)size, !consumed);
3412
3413 if (done < 0) {
3414 Py_XDECREF(v);
3415 return NULL;
3416 }
3417
3418 if (consumed)
3419 *consumed += done;
3420
3421#ifdef NEED_RETRY
3422 if (size > INT_MAX) {
3423 s += done;
3424 size -= done;
3425 goto retry;
3426 }
3427#endif
3428
3429 return (PyObject *)v;
3430}
3431
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003432PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003433 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003434 const char *errors)
3435{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003436 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3437}
3438
3439/*
3440 * Convert unicode into string object (MBCS).
3441 * Returns 0 if succeed, -1 otherwise.
3442 */
3443static int encode_mbcs(PyObject **repr,
3444 const Py_UNICODE *p, /* unicode */
3445 int size) /* size of unicode */
3446{
3447 int mbcssize = 0;
3448 Py_ssize_t n = 0;
3449
3450 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003451
3452 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003453 if (size > 0) {
3454 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3455 if (mbcssize == 0) {
3456 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3457 return -1;
3458 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003459 }
3460
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003461 if (*repr == NULL) {
3462 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003463 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003464 if (*repr == NULL)
3465 return -1;
3466 }
3467 else {
3468 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003469 n = PyBytes_Size(*repr);
3470 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003471 return -1;
3472 }
3473
3474 /* Do the conversion */
3475 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003476 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003477 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3478 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3479 return -1;
3480 }
3481 }
3482
3483 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003484}
3485
3486PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003487 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003488 const char *errors)
3489{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003490 PyObject *repr = NULL;
3491 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003492
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003493#ifdef NEED_RETRY
3494 retry:
3495 if (size > INT_MAX)
3496 ret = encode_mbcs(&repr, p, INT_MAX);
3497 else
3498#endif
3499 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003500
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003501 if (ret < 0) {
3502 Py_XDECREF(repr);
3503 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003504 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003505
3506#ifdef NEED_RETRY
3507 if (size > INT_MAX) {
3508 p += INT_MAX;
3509 size -= INT_MAX;
3510 goto retry;
3511 }
3512#endif
3513
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003514 return repr;
3515}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003516
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003517PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3518{
3519 if (!PyUnicode_Check(unicode)) {
3520 PyErr_BadArgument();
3521 return NULL;
3522 }
3523 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3524 PyUnicode_GET_SIZE(unicode),
3525 NULL);
3526}
3527
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003528#undef NEED_RETRY
3529
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003530#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003531
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532/* --- Character Mapping Codec -------------------------------------------- */
3533
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003535 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 PyObject *mapping,
3537 const char *errors)
3538{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003540 Py_ssize_t startinpos;
3541 Py_ssize_t endinpos;
3542 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 PyUnicodeObject *v;
3545 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003546 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 PyObject *errorHandler = NULL;
3548 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003549 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003550 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003551
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 /* Default to Latin-1 */
3553 if (mapping == NULL)
3554 return PyUnicode_DecodeLatin1(s, size, errors);
3555
3556 v = _PyUnicode_New(size);
3557 if (v == NULL)
3558 goto onError;
3559 if (size == 0)
3560 return (PyObject *)v;
3561 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003563 if (PyUnicode_CheckExact(mapping)) {
3564 mapstring = PyUnicode_AS_UNICODE(mapping);
3565 maplen = PyUnicode_GET_SIZE(mapping);
3566 while (s < e) {
3567 unsigned char ch = *s;
3568 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003570 if (ch < maplen)
3571 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003573 if (x == 0xfffe) {
3574 /* undefined mapping */
3575 outpos = p-PyUnicode_AS_UNICODE(v);
3576 startinpos = s-starts;
3577 endinpos = startinpos+1;
3578 if (unicode_decode_call_errorhandler(
3579 errors, &errorHandler,
3580 "charmap", "character maps to <undefined>",
3581 starts, size, &startinpos, &endinpos, &exc, &s,
3582 (PyObject **)&v, &outpos, &p)) {
3583 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003584 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003585 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003586 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003587 *p++ = x;
3588 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003590 }
3591 else {
3592 while (s < e) {
3593 unsigned char ch = *s;
3594 PyObject *w, *x;
3595
3596 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3597 w = PyInt_FromLong((long)ch);
3598 if (w == NULL)
3599 goto onError;
3600 x = PyObject_GetItem(mapping, w);
3601 Py_DECREF(w);
3602 if (x == NULL) {
3603 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3604 /* No mapping found means: mapping is undefined. */
3605 PyErr_Clear();
3606 x = Py_None;
3607 Py_INCREF(x);
3608 } else
3609 goto onError;
3610 }
3611
3612 /* Apply mapping */
3613 if (PyInt_Check(x)) {
3614 long value = PyInt_AS_LONG(x);
3615 if (value < 0 || value > 65535) {
3616 PyErr_SetString(PyExc_TypeError,
3617 "character mapping must be in range(65536)");
3618 Py_DECREF(x);
3619 goto onError;
3620 }
3621 *p++ = (Py_UNICODE)value;
3622 }
3623 else if (x == Py_None) {
3624 /* undefined mapping */
3625 outpos = p-PyUnicode_AS_UNICODE(v);
3626 startinpos = s-starts;
3627 endinpos = startinpos+1;
3628 if (unicode_decode_call_errorhandler(
3629 errors, &errorHandler,
3630 "charmap", "character maps to <undefined>",
3631 starts, size, &startinpos, &endinpos, &exc, &s,
3632 (PyObject **)&v, &outpos, &p)) {
3633 Py_DECREF(x);
3634 goto onError;
3635 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003636 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003637 continue;
3638 }
3639 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003640 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003641
3642 if (targetsize == 1)
3643 /* 1-1 mapping */
3644 *p++ = *PyUnicode_AS_UNICODE(x);
3645
3646 else if (targetsize > 1) {
3647 /* 1-n mapping */
3648 if (targetsize > extrachars) {
3649 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003650 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3651 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003652 (targetsize << 2);
3653 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003654 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003655 if (_PyUnicode_Resize(&v,
3656 PyUnicode_GET_SIZE(v) + needed) < 0) {
3657 Py_DECREF(x);
3658 goto onError;
3659 }
3660 p = PyUnicode_AS_UNICODE(v) + oldpos;
3661 }
3662 Py_UNICODE_COPY(p,
3663 PyUnicode_AS_UNICODE(x),
3664 targetsize);
3665 p += targetsize;
3666 extrachars -= targetsize;
3667 }
3668 /* 1-0 mapping: skip the character */
3669 }
3670 else {
3671 /* wrong return value */
3672 PyErr_SetString(PyExc_TypeError,
3673 "character mapping must return integer, None or unicode");
3674 Py_DECREF(x);
3675 goto onError;
3676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003678 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 }
3681 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003682 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 Py_XDECREF(errorHandler);
3685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003687
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 Py_XDECREF(errorHandler);
3690 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 Py_XDECREF(v);
3692 return NULL;
3693}
3694
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003695/* Charmap encoding: the lookup table */
3696
3697struct encoding_map{
3698 PyObject_HEAD
3699 unsigned char level1[32];
3700 int count2, count3;
3701 unsigned char level23[1];
3702};
3703
3704static PyObject*
3705encoding_map_size(PyObject *obj, PyObject* args)
3706{
3707 struct encoding_map *map = (struct encoding_map*)obj;
3708 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3709 128*map->count3);
3710}
3711
3712static PyMethodDef encoding_map_methods[] = {
3713 {"size", encoding_map_size, METH_NOARGS,
3714 PyDoc_STR("Return the size (in bytes) of this object") },
3715 { 0 }
3716};
3717
3718static void
3719encoding_map_dealloc(PyObject* o)
3720{
3721 PyObject_FREE(o);
3722}
3723
3724static PyTypeObject EncodingMapType = {
3725 PyObject_HEAD_INIT(NULL)
3726 0, /*ob_size*/
3727 "EncodingMap", /*tp_name*/
3728 sizeof(struct encoding_map), /*tp_basicsize*/
3729 0, /*tp_itemsize*/
3730 /* methods */
3731 encoding_map_dealloc, /*tp_dealloc*/
3732 0, /*tp_print*/
3733 0, /*tp_getattr*/
3734 0, /*tp_setattr*/
3735 0, /*tp_compare*/
3736 0, /*tp_repr*/
3737 0, /*tp_as_number*/
3738 0, /*tp_as_sequence*/
3739 0, /*tp_as_mapping*/
3740 0, /*tp_hash*/
3741 0, /*tp_call*/
3742 0, /*tp_str*/
3743 0, /*tp_getattro*/
3744 0, /*tp_setattro*/
3745 0, /*tp_as_buffer*/
3746 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3747 0, /*tp_doc*/
3748 0, /*tp_traverse*/
3749 0, /*tp_clear*/
3750 0, /*tp_richcompare*/
3751 0, /*tp_weaklistoffset*/
3752 0, /*tp_iter*/
3753 0, /*tp_iternext*/
3754 encoding_map_methods, /*tp_methods*/
3755 0, /*tp_members*/
3756 0, /*tp_getset*/
3757 0, /*tp_base*/
3758 0, /*tp_dict*/
3759 0, /*tp_descr_get*/
3760 0, /*tp_descr_set*/
3761 0, /*tp_dictoffset*/
3762 0, /*tp_init*/
3763 0, /*tp_alloc*/
3764 0, /*tp_new*/
3765 0, /*tp_free*/
3766 0, /*tp_is_gc*/
3767};
3768
3769PyObject*
3770PyUnicode_BuildEncodingMap(PyObject* string)
3771{
3772 Py_UNICODE *decode;
3773 PyObject *result;
3774 struct encoding_map *mresult;
3775 int i;
3776 int need_dict = 0;
3777 unsigned char level1[32];
3778 unsigned char level2[512];
3779 unsigned char *mlevel1, *mlevel2, *mlevel3;
3780 int count2 = 0, count3 = 0;
3781
3782 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3783 PyErr_BadArgument();
3784 return NULL;
3785 }
3786 decode = PyUnicode_AS_UNICODE(string);
3787 memset(level1, 0xFF, sizeof level1);
3788 memset(level2, 0xFF, sizeof level2);
3789
3790 /* If there isn't a one-to-one mapping of NULL to \0,
3791 or if there are non-BMP characters, we need to use
3792 a mapping dictionary. */
3793 if (decode[0] != 0)
3794 need_dict = 1;
3795 for (i = 1; i < 256; i++) {
3796 int l1, l2;
3797 if (decode[i] == 0
3798 #ifdef Py_UNICODE_WIDE
3799 || decode[i] > 0xFFFF
3800 #endif
3801 ) {
3802 need_dict = 1;
3803 break;
3804 }
3805 if (decode[i] == 0xFFFE)
3806 /* unmapped character */
3807 continue;
3808 l1 = decode[i] >> 11;
3809 l2 = decode[i] >> 7;
3810 if (level1[l1] == 0xFF)
3811 level1[l1] = count2++;
3812 if (level2[l2] == 0xFF)
3813 level2[l2] = count3++;
3814 }
3815
3816 if (count2 >= 0xFF || count3 >= 0xFF)
3817 need_dict = 1;
3818
3819 if (need_dict) {
3820 PyObject *result = PyDict_New();
3821 PyObject *key, *value;
3822 if (!result)
3823 return NULL;
3824 for (i = 0; i < 256; i++) {
3825 key = value = NULL;
3826 key = PyInt_FromLong(decode[i]);
3827 value = PyInt_FromLong(i);
3828 if (!key || !value)
3829 goto failed1;
3830 if (PyDict_SetItem(result, key, value) == -1)
3831 goto failed1;
3832 Py_DECREF(key);
3833 Py_DECREF(value);
3834 }
3835 return result;
3836 failed1:
3837 Py_XDECREF(key);
3838 Py_XDECREF(value);
3839 Py_DECREF(result);
3840 return NULL;
3841 }
3842
3843 /* Create a three-level trie */
3844 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3845 16*count2 + 128*count3 - 1);
3846 if (!result)
3847 return PyErr_NoMemory();
3848 PyObject_Init(result, &EncodingMapType);
3849 mresult = (struct encoding_map*)result;
3850 mresult->count2 = count2;
3851 mresult->count3 = count3;
3852 mlevel1 = mresult->level1;
3853 mlevel2 = mresult->level23;
3854 mlevel3 = mresult->level23 + 16*count2;
3855 memcpy(mlevel1, level1, 32);
3856 memset(mlevel2, 0xFF, 16*count2);
3857 memset(mlevel3, 0, 128*count3);
3858 count3 = 0;
3859 for (i = 1; i < 256; i++) {
3860 int o1, o2, o3, i2, i3;
3861 if (decode[i] == 0xFFFE)
3862 /* unmapped character */
3863 continue;
3864 o1 = decode[i]>>11;
3865 o2 = (decode[i]>>7) & 0xF;
3866 i2 = 16*mlevel1[o1] + o2;
3867 if (mlevel2[i2] == 0xFF)
3868 mlevel2[i2] = count3++;
3869 o3 = decode[i] & 0x7F;
3870 i3 = 128*mlevel2[i2] + o3;
3871 mlevel3[i3] = i;
3872 }
3873 return result;
3874}
3875
3876static int
3877encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3878{
3879 struct encoding_map *map = (struct encoding_map*)mapping;
3880 int l1 = c>>11;
3881 int l2 = (c>>7) & 0xF;
3882 int l3 = c & 0x7F;
3883 int i;
3884
3885#ifdef Py_UNICODE_WIDE
3886 if (c > 0xFFFF) {
3887 return -1;
3888 }
3889#endif
3890 if (c == 0)
3891 return 0;
3892 /* level 1*/
3893 i = map->level1[l1];
3894 if (i == 0xFF) {
3895 return -1;
3896 }
3897 /* level 2*/
3898 i = map->level23[16*i+l2];
3899 if (i == 0xFF) {
3900 return -1;
3901 }
3902 /* level 3 */
3903 i = map->level23[16*map->count2 + 128*i + l3];
3904 if (i == 0) {
3905 return -1;
3906 }
3907 return i;
3908}
3909
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910/* Lookup the character ch in the mapping. If the character
3911 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003912 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915 PyObject *w = PyInt_FromLong((long)c);
3916 PyObject *x;
3917
3918 if (w == NULL)
3919 return NULL;
3920 x = PyObject_GetItem(mapping, w);
3921 Py_DECREF(w);
3922 if (x == NULL) {
3923 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3924 /* No mapping found means: mapping is undefined. */
3925 PyErr_Clear();
3926 x = Py_None;
3927 Py_INCREF(x);
3928 return x;
3929 } else
3930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003932 else if (x == Py_None)
3933 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 else if (PyInt_Check(x)) {
3935 long value = PyInt_AS_LONG(x);
3936 if (value < 0 || value > 255) {
3937 PyErr_SetString(PyExc_TypeError,
3938 "character mapping must be in range(256)");
3939 Py_DECREF(x);
3940 return NULL;
3941 }
3942 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944 else if (PyString_Check(x))
3945 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003948 PyErr_Format(PyExc_TypeError,
3949 "character mapping must return integer, None or str8, not %.400s",
3950 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 Py_DECREF(x);
3952 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 }
3954}
3955
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003956static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003957charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003958{
Walter Dörwald827b0552007-05-12 13:23:53 +00003959 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003960 /* exponentially overallocate to minimize reallocations */
3961 if (requiredsize < 2*outsize)
3962 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003963 if (PyBytes_Resize(outobj, requiredsize)) {
3964 Py_DECREF(outobj);
3965 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003966 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003967 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003968}
3969
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003974 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975 space is available. Return a new reference to the object that
3976 was put in the output buffer, or Py_None, if the mapping was undefined
3977 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003978 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003980charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003981 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003983 PyObject *rep;
3984 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003985 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003987 if (mapping->ob_type == &EncodingMapType) {
3988 int res = encoding_map_lookup(c, mapping);
3989 Py_ssize_t requiredsize = *outpos+1;
3990 if (res == -1)
3991 return enc_FAILED;
3992 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00003993 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003994 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00003995 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003996 outstart[(*outpos)++] = (char)res;
3997 return enc_SUCCESS;
3998 }
3999
4000 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004002 return enc_EXCEPTION;
4003 else if (rep==Py_None) {
4004 Py_DECREF(rep);
4005 return enc_FAILED;
4006 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004008 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004009 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004010 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004012 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004013 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004014 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4016 }
4017 else {
4018 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004019 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4020 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004021 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004022 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004024 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004026 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 memcpy(outstart + *outpos, repchars, repsize);
4028 *outpos += repsize;
4029 }
4030 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004031 Py_DECREF(rep);
4032 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004033}
4034
4035/* handle an error in PyUnicode_EncodeCharmap
4036 Return 0 on success, -1 on error */
4037static
4038int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004039 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004041 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004042 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043{
4044 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004045 Py_ssize_t repsize;
4046 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 Py_UNICODE *uni2;
4048 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004049 Py_ssize_t collstartpos = *inpos;
4050 Py_ssize_t collendpos = *inpos+1;
4051 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052 char *encoding = "charmap";
4053 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004054 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056 /* find all unencodable characters */
4057 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004058 PyObject *rep;
4059 if (mapping->ob_type == &EncodingMapType) {
4060 int res = encoding_map_lookup(p[collendpos], mapping);
4061 if (res != -1)
4062 break;
4063 ++collendpos;
4064 continue;
4065 }
4066
4067 rep = charmapencode_lookup(p[collendpos], mapping);
4068 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004070 else if (rep!=Py_None) {
4071 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 break;
4073 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004074 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075 ++collendpos;
4076 }
4077 /* cache callback name lookup
4078 * (if not done yet, i.e. it's the first error) */
4079 if (*known_errorHandler==-1) {
4080 if ((errors==NULL) || (!strcmp(errors, "strict")))
4081 *known_errorHandler = 1;
4082 else if (!strcmp(errors, "replace"))
4083 *known_errorHandler = 2;
4084 else if (!strcmp(errors, "ignore"))
4085 *known_errorHandler = 3;
4086 else if (!strcmp(errors, "xmlcharrefreplace"))
4087 *known_errorHandler = 4;
4088 else
4089 *known_errorHandler = 0;
4090 }
4091 switch (*known_errorHandler) {
4092 case 1: /* strict */
4093 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4094 return -1;
4095 case 2: /* replace */
4096 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4097 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004098 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004099 return -1;
4100 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004101 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4103 return -1;
4104 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004105 }
4106 /* fall through */
4107 case 3: /* ignore */
4108 *inpos = collendpos;
4109 break;
4110 case 4: /* xmlcharrefreplace */
4111 /* generate replacement (temporarily (mis)uses p) */
4112 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4113 char buffer[2+29+1+1];
4114 char *cp;
4115 sprintf(buffer, "&#%d;", (int)p[collpos]);
4116 for (cp = buffer; *cp; ++cp) {
4117 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004118 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004120 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4122 return -1;
4123 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 }
4125 }
4126 *inpos = collendpos;
4127 break;
4128 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004129 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 encoding, reason, p, size, exceptionObject,
4131 collstartpos, collendpos, &newpos);
4132 if (repunicode == NULL)
4133 return -1;
4134 /* generate replacement */
4135 repsize = PyUnicode_GET_SIZE(repunicode);
4136 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4137 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004138 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 return -1;
4140 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004141 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4144 return -1;
4145 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146 }
4147 *inpos = newpos;
4148 Py_DECREF(repunicode);
4149 }
4150 return 0;
4151}
4152
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004154 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 PyObject *mapping,
4156 const char *errors)
4157{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 /* output object */
4159 PyObject *res = NULL;
4160 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004161 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004163 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 PyObject *errorHandler = NULL;
4165 PyObject *exc = NULL;
4166 /* the following variable is used for caching string comparisons
4167 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4168 * 3=ignore, 4=xmlcharrefreplace */
4169 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170
4171 /* Default to Latin-1 */
4172 if (mapping == NULL)
4173 return PyUnicode_EncodeLatin1(p, size, errors);
4174
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 /* allocate enough for a simple encoding without
4176 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004177 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 if (res == NULL)
4179 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004180 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 while (inpos<size) {
4184 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004185 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004186 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004188 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 if (charmap_encoding_error(p, size, &inpos, mapping,
4190 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004191 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004192 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004193 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 else
4197 /* done with this character => adjust input position */
4198 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004202 if (respos<PyBytes_GET_SIZE(res)) {
4203 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 goto onError;
4205 }
4206 Py_XDECREF(exc);
4207 Py_XDECREF(errorHandler);
4208 return res;
4209
4210 onError:
4211 Py_XDECREF(res);
4212 Py_XDECREF(exc);
4213 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 return NULL;
4215}
4216
4217PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4218 PyObject *mapping)
4219{
4220 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4221 PyErr_BadArgument();
4222 return NULL;
4223 }
4224 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4225 PyUnicode_GET_SIZE(unicode),
4226 mapping,
4227 NULL);
4228}
4229
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230/* create or adjust a UnicodeTranslateError */
4231static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004232 const Py_UNICODE *unicode, Py_ssize_t size,
4233 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 if (*exceptionObject == NULL) {
4237 *exceptionObject = PyUnicodeTranslateError_Create(
4238 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239 }
4240 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4242 goto onError;
4243 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4244 goto onError;
4245 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4246 goto onError;
4247 return;
4248 onError:
4249 Py_DECREF(*exceptionObject);
4250 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251 }
4252}
4253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254/* raises a UnicodeTranslateError */
4255static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004256 const Py_UNICODE *unicode, Py_ssize_t size,
4257 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 const char *reason)
4259{
4260 make_translate_exception(exceptionObject,
4261 unicode, size, startpos, endpos, reason);
4262 if (*exceptionObject != NULL)
4263 PyCodec_StrictErrors(*exceptionObject);
4264}
4265
4266/* error handling callback helper:
4267 build arguments, call the callback and check the arguments,
4268 put the result into newpos and return the replacement string, which
4269 has to be freed by the caller */
4270static PyObject *unicode_translate_call_errorhandler(const char *errors,
4271 PyObject **errorHandler,
4272 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004273 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4274 Py_ssize_t startpos, Py_ssize_t endpos,
4275 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004277 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004279 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 PyObject *restuple;
4281 PyObject *resunicode;
4282
4283 if (*errorHandler == NULL) {
4284 *errorHandler = PyCodec_LookupError(errors);
4285 if (*errorHandler == NULL)
4286 return NULL;
4287 }
4288
4289 make_translate_exception(exceptionObject,
4290 unicode, size, startpos, endpos, reason);
4291 if (*exceptionObject == NULL)
4292 return NULL;
4293
4294 restuple = PyObject_CallFunctionObjArgs(
4295 *errorHandler, *exceptionObject, NULL);
4296 if (restuple == NULL)
4297 return NULL;
4298 if (!PyTuple_Check(restuple)) {
4299 PyErr_Format(PyExc_TypeError, &argparse[4]);
4300 Py_DECREF(restuple);
4301 return NULL;
4302 }
4303 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004304 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305 Py_DECREF(restuple);
4306 return NULL;
4307 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004308 if (i_newpos<0)
4309 *newpos = size+i_newpos;
4310 else
4311 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004312 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004313 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004314 Py_DECREF(restuple);
4315 return NULL;
4316 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 Py_INCREF(resunicode);
4318 Py_DECREF(restuple);
4319 return resunicode;
4320}
4321
4322/* Lookup the character ch in the mapping and put the result in result,
4323 which must be decrefed by the caller.
4324 Return 0 on success, -1 on error */
4325static
4326int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4327{
4328 PyObject *w = PyInt_FromLong((long)c);
4329 PyObject *x;
4330
4331 if (w == NULL)
4332 return -1;
4333 x = PyObject_GetItem(mapping, w);
4334 Py_DECREF(w);
4335 if (x == NULL) {
4336 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4337 /* No mapping found means: use 1:1 mapping. */
4338 PyErr_Clear();
4339 *result = NULL;
4340 return 0;
4341 } else
4342 return -1;
4343 }
4344 else if (x == Py_None) {
4345 *result = x;
4346 return 0;
4347 }
4348 else if (PyInt_Check(x)) {
4349 long value = PyInt_AS_LONG(x);
4350 long max = PyUnicode_GetMax();
4351 if (value < 0 || value > max) {
4352 PyErr_Format(PyExc_TypeError,
4353 "character mapping must be in range(0x%lx)", max+1);
4354 Py_DECREF(x);
4355 return -1;
4356 }
4357 *result = x;
4358 return 0;
4359 }
4360 else if (PyUnicode_Check(x)) {
4361 *result = x;
4362 return 0;
4363 }
4364 else {
4365 /* wrong return value */
4366 PyErr_SetString(PyExc_TypeError,
4367 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004368 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369 return -1;
4370 }
4371}
4372/* ensure that *outobj is at least requiredsize characters long,
4373if not reallocate and adjust various state variables.
4374Return 0 on success, -1 on error */
4375static
Walter Dörwald4894c302003-10-24 14:25:28 +00004376int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004377 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004379 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004380 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004382 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004384 if (requiredsize < 2 * oldsize)
4385 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004386 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 return -1;
4388 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 }
4390 return 0;
4391}
4392/* lookup the character, put the result in the output string and adjust
4393 various state variables. Return a new reference to the object that
4394 was put in the output buffer in *result, or Py_None, if the mapping was
4395 undefined (in which case no character was written).
4396 The called must decref result.
4397 Return 0 on success, -1 on error. */
4398static
Walter Dörwald4894c302003-10-24 14:25:28 +00004399int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004400 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004401 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402{
Walter Dörwald4894c302003-10-24 14:25:28 +00004403 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 return -1;
4405 if (*res==NULL) {
4406 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004407 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 }
4409 else if (*res==Py_None)
4410 ;
4411 else if (PyInt_Check(*res)) {
4412 /* no overflow check, because we know that the space is enough */
4413 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4414 }
4415 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004416 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 if (repsize==1) {
4418 /* no overflow check, because we know that the space is enough */
4419 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4420 }
4421 else if (repsize!=0) {
4422 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004423 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004424 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004425 repsize - 1;
4426 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 return -1;
4428 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4429 *outp += repsize;
4430 }
4431 }
4432 else
4433 return -1;
4434 return 0;
4435}
4436
4437PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004438 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 PyObject *mapping,
4440 const char *errors)
4441{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 /* output object */
4443 PyObject *res = NULL;
4444 /* pointers to the beginning and end+1 of input */
4445 const Py_UNICODE *startp = p;
4446 const Py_UNICODE *endp = p + size;
4447 /* pointer into the output */
4448 Py_UNICODE *str;
4449 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004450 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 char *reason = "character maps to <undefined>";
4452 PyObject *errorHandler = NULL;
4453 PyObject *exc = NULL;
4454 /* the following variable is used for caching string comparisons
4455 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4456 * 3=ignore, 4=xmlcharrefreplace */
4457 int known_errorHandler = -1;
4458
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 if (mapping == NULL) {
4460 PyErr_BadArgument();
4461 return NULL;
4462 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463
4464 /* allocate enough for a simple 1:1 translation without
4465 replacements, if we need more, we'll resize */
4466 res = PyUnicode_FromUnicode(NULL, size);
4467 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004468 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 return res;
4471 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 while (p<endp) {
4474 /* try to encode it */
4475 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004476 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 goto onError;
4479 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004480 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 if (x!=Py_None) /* it worked => adjust input pointer */
4482 ++p;
4483 else { /* untranslatable character */
4484 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004485 Py_ssize_t repsize;
4486 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 Py_UNICODE *uni2;
4488 /* startpos for collecting untranslatable chars */
4489 const Py_UNICODE *collstart = p;
4490 const Py_UNICODE *collend = p+1;
4491 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 /* find all untranslatable characters */
4494 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004495 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496 goto onError;
4497 Py_XDECREF(x);
4498 if (x!=Py_None)
4499 break;
4500 ++collend;
4501 }
4502 /* cache callback name lookup
4503 * (if not done yet, i.e. it's the first error) */
4504 if (known_errorHandler==-1) {
4505 if ((errors==NULL) || (!strcmp(errors, "strict")))
4506 known_errorHandler = 1;
4507 else if (!strcmp(errors, "replace"))
4508 known_errorHandler = 2;
4509 else if (!strcmp(errors, "ignore"))
4510 known_errorHandler = 3;
4511 else if (!strcmp(errors, "xmlcharrefreplace"))
4512 known_errorHandler = 4;
4513 else
4514 known_errorHandler = 0;
4515 }
4516 switch (known_errorHandler) {
4517 case 1: /* strict */
4518 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4519 goto onError;
4520 case 2: /* replace */
4521 /* No need to check for space, this is a 1:1 replacement */
4522 for (coll = collstart; coll<collend; ++coll)
4523 *str++ = '?';
4524 /* fall through */
4525 case 3: /* ignore */
4526 p = collend;
4527 break;
4528 case 4: /* xmlcharrefreplace */
4529 /* generate replacement (temporarily (mis)uses p) */
4530 for (p = collstart; p < collend; ++p) {
4531 char buffer[2+29+1+1];
4532 char *cp;
4533 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004534 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4536 goto onError;
4537 for (cp = buffer; *cp; ++cp)
4538 *str++ = *cp;
4539 }
4540 p = collend;
4541 break;
4542 default:
4543 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4544 reason, startp, size, &exc,
4545 collstart-startp, collend-startp, &newpos);
4546 if (repunicode == NULL)
4547 goto onError;
4548 /* generate replacement */
4549 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004550 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4552 Py_DECREF(repunicode);
4553 goto onError;
4554 }
4555 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4556 *str++ = *uni2;
4557 p = startp + newpos;
4558 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559 }
4560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 /* Resize if we allocated to much */
4563 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004564 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004565 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004566 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004567 }
4568 Py_XDECREF(exc);
4569 Py_XDECREF(errorHandler);
4570 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 onError:
4573 Py_XDECREF(res);
4574 Py_XDECREF(exc);
4575 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576 return NULL;
4577}
4578
4579PyObject *PyUnicode_Translate(PyObject *str,
4580 PyObject *mapping,
4581 const char *errors)
4582{
4583 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004584
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 str = PyUnicode_FromObject(str);
4586 if (str == NULL)
4587 goto onError;
4588 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4589 PyUnicode_GET_SIZE(str),
4590 mapping,
4591 errors);
4592 Py_DECREF(str);
4593 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004594
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595 onError:
4596 Py_XDECREF(str);
4597 return NULL;
4598}
Tim Petersced69f82003-09-16 20:30:58 +00004599
Guido van Rossum9e896b32000-04-05 20:11:21 +00004600/* --- Decimal Encoder ---------------------------------------------------- */
4601
4602int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004603 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004604 char *output,
4605 const char *errors)
4606{
4607 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 PyObject *errorHandler = NULL;
4609 PyObject *exc = NULL;
4610 const char *encoding = "decimal";
4611 const char *reason = "invalid decimal Unicode string";
4612 /* the following variable is used for caching string comparisons
4613 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4614 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004615
4616 if (output == NULL) {
4617 PyErr_BadArgument();
4618 return -1;
4619 }
4620
4621 p = s;
4622 end = s + length;
4623 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004625 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004627 Py_ssize_t repsize;
4628 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 Py_UNICODE *uni2;
4630 Py_UNICODE *collstart;
4631 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004632
Guido van Rossum9e896b32000-04-05 20:11:21 +00004633 if (Py_UNICODE_ISSPACE(ch)) {
4634 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004635 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004636 continue;
4637 }
4638 decimal = Py_UNICODE_TODECIMAL(ch);
4639 if (decimal >= 0) {
4640 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004642 continue;
4643 }
Guido van Rossumba477042000-04-06 18:18:10 +00004644 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004645 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004647 continue;
4648 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 /* All other characters are considered unencodable */
4650 collstart = p;
4651 collend = p+1;
4652 while (collend < end) {
4653 if ((0 < *collend && *collend < 256) ||
4654 !Py_UNICODE_ISSPACE(*collend) ||
4655 Py_UNICODE_TODECIMAL(*collend))
4656 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004657 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658 /* cache callback name lookup
4659 * (if not done yet, i.e. it's the first error) */
4660 if (known_errorHandler==-1) {
4661 if ((errors==NULL) || (!strcmp(errors, "strict")))
4662 known_errorHandler = 1;
4663 else if (!strcmp(errors, "replace"))
4664 known_errorHandler = 2;
4665 else if (!strcmp(errors, "ignore"))
4666 known_errorHandler = 3;
4667 else if (!strcmp(errors, "xmlcharrefreplace"))
4668 known_errorHandler = 4;
4669 else
4670 known_errorHandler = 0;
4671 }
4672 switch (known_errorHandler) {
4673 case 1: /* strict */
4674 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4675 goto onError;
4676 case 2: /* replace */
4677 for (p = collstart; p < collend; ++p)
4678 *output++ = '?';
4679 /* fall through */
4680 case 3: /* ignore */
4681 p = collend;
4682 break;
4683 case 4: /* xmlcharrefreplace */
4684 /* generate replacement (temporarily (mis)uses p) */
4685 for (p = collstart; p < collend; ++p)
4686 output += sprintf(output, "&#%d;", (int)*p);
4687 p = collend;
4688 break;
4689 default:
4690 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4691 encoding, reason, s, length, &exc,
4692 collstart-s, collend-s, &newpos);
4693 if (repunicode == NULL)
4694 goto onError;
4695 /* generate replacement */
4696 repsize = PyUnicode_GET_SIZE(repunicode);
4697 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4698 Py_UNICODE ch = *uni2;
4699 if (Py_UNICODE_ISSPACE(ch))
4700 *output++ = ' ';
4701 else {
4702 decimal = Py_UNICODE_TODECIMAL(ch);
4703 if (decimal >= 0)
4704 *output++ = '0' + decimal;
4705 else if (0 < ch && ch < 256)
4706 *output++ = (char)ch;
4707 else {
4708 Py_DECREF(repunicode);
4709 raise_encode_exception(&exc, encoding,
4710 s, length, collstart-s, collend-s, reason);
4711 goto onError;
4712 }
4713 }
4714 }
4715 p = s + newpos;
4716 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004717 }
4718 }
4719 /* 0-terminate the output string */
4720 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 Py_XDECREF(exc);
4722 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004723 return 0;
4724
4725 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726 Py_XDECREF(exc);
4727 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004728 return -1;
4729}
4730
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731/* --- Helpers ------------------------------------------------------------ */
4732
/* Configuration macros for the stringlib headers included further down:
   the character type, and the length / constructor / buffer accessors,
   all mapped onto their Py_UNICODE counterparts. */
#define STRINGLIB_CHAR Py_UNICODE

#define STRINGLIB_LEN PyUnicode_GET_SIZE
#define STRINGLIB_NEW PyUnicode_FromUnicode
#define STRINGLIB_STR PyUnicode_AS_UNICODE
4738
4739Py_LOCAL_INLINE(int)
4740STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004742 if (str[0] != other[0])
4743 return 1;
4744 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745}
4746
/* Maps stringlib's STRINGLIB_EMPTY onto unicode_empty (presumably the
   shared empty unicode object -- defined outside this section). */
#define STRINGLIB_EMPTY unicode_empty
4748
4749#include "stringlib/fastsearch.h"
4750
4751#include "stringlib/count.h"
4752#include "stringlib/find.h"
4753#include "stringlib/partition.h"
4754
/* helper macro to fixup start/end slice values

   Operates on variables named `start` and `end` that must be in scope at
   the point of use; obj must have a `length` member.  Negative values are
   taken relative to the end of obj, then clamped to 0; `end` is clamped
   to obj->length before the negative adjustment.  `start` is NOT clamped
   to obj->length, so it may still exceed `end` afterwards.

   NOTE: this expands to several independent statements (it is not wrapped
   in do { } while (0)), so it must not be used as the unbraced body of an
   if/else/loop. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
4767
Martin v. Löwis18e16552006-02-15 17:27:45 +00004768Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004769 PyObject *substr,
4770 Py_ssize_t start,
4771 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004773 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004774 PyUnicodeObject* str_obj;
4775 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004776
Thomas Wouters477c8d52006-05-27 19:21:47 +00004777 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4778 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004780 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4781 if (!sub_obj) {
4782 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 return -1;
4784 }
Tim Petersced69f82003-09-16 20:30:58 +00004785
Thomas Wouters477c8d52006-05-27 19:21:47 +00004786 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004787
Thomas Wouters477c8d52006-05-27 19:21:47 +00004788 result = stringlib_count(
4789 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4790 );
4791
4792 Py_DECREF(sub_obj);
4793 Py_DECREF(str_obj);
4794
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795 return result;
4796}
4797
Martin v. Löwis18e16552006-02-15 17:27:45 +00004798Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004799 PyObject *sub,
4800 Py_ssize_t start,
4801 Py_ssize_t end,
4802 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004804 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004805
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004807 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004808 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004809 sub = PyUnicode_FromObject(sub);
4810 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004811 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004812 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 }
Tim Petersced69f82003-09-16 20:30:58 +00004814
Thomas Wouters477c8d52006-05-27 19:21:47 +00004815 if (direction > 0)
4816 result = stringlib_find_slice(
4817 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4818 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4819 start, end
4820 );
4821 else
4822 result = stringlib_rfind_slice(
4823 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4824 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4825 start, end
4826 );
4827
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004829 Py_DECREF(sub);
4830
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 return result;
4832}
4833
Tim Petersced69f82003-09-16 20:30:58 +00004834static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835int tailmatch(PyUnicodeObject *self,
4836 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004837 Py_ssize_t start,
4838 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 int direction)
4840{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 if (substring->length == 0)
4842 return 1;
4843
Thomas Wouters477c8d52006-05-27 19:21:47 +00004844 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845
4846 end -= substring->length;
4847 if (end < start)
4848 return 0;
4849
4850 if (direction > 0) {
4851 if (Py_UNICODE_MATCH(self, end, substring))
4852 return 1;
4853 } else {
4854 if (Py_UNICODE_MATCH(self, start, substring))
4855 return 1;
4856 }
4857
4858 return 0;
4859}
4860
Martin v. Löwis18e16552006-02-15 17:27:45 +00004861Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004863 Py_ssize_t start,
4864 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 int direction)
4866{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004867 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004868
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 str = PyUnicode_FromObject(str);
4870 if (str == NULL)
4871 return -1;
4872 substr = PyUnicode_FromObject(substr);
4873 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004874 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 return -1;
4876 }
Tim Petersced69f82003-09-16 20:30:58 +00004877
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 result = tailmatch((PyUnicodeObject *)str,
4879 (PyUnicodeObject *)substr,
4880 start, end, direction);
4881 Py_DECREF(str);
4882 Py_DECREF(substr);
4883 return result;
4884}
4885
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886/* Apply fixfct filter to the Unicode object self and return a
4887 reference to the modified object */
4888
Tim Petersced69f82003-09-16 20:30:58 +00004889static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890PyObject *fixup(PyUnicodeObject *self,
4891 int (*fixfct)(PyUnicodeObject *s))
4892{
4893
4894 PyUnicodeObject *u;
4895
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004896 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 if (u == NULL)
4898 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004899
4900 Py_UNICODE_COPY(u->str, self->str, self->length);
4901
Tim Peters7a29bd52001-09-12 03:03:31 +00004902 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 /* fixfct should return TRUE if it modified the buffer. If
4904 FALSE, return a reference to the original buffer instead
4905 (to save space, not time) */
4906 Py_INCREF(self);
4907 Py_DECREF(u);
4908 return (PyObject*) self;
4909 }
4910 return (PyObject*) u;
4911}
4912
Tim Petersced69f82003-09-16 20:30:58 +00004913static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914int fixupper(PyUnicodeObject *self)
4915{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004916 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 Py_UNICODE *s = self->str;
4918 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004919
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 while (len-- > 0) {
4921 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004922
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 ch = Py_UNICODE_TOUPPER(*s);
4924 if (ch != *s) {
4925 status = 1;
4926 *s = ch;
4927 }
4928 s++;
4929 }
4930
4931 return status;
4932}
4933
Tim Petersced69f82003-09-16 20:30:58 +00004934static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935int fixlower(PyUnicodeObject *self)
4936{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004937 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 Py_UNICODE *s = self->str;
4939 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004940
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 while (len-- > 0) {
4942 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004943
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944 ch = Py_UNICODE_TOLOWER(*s);
4945 if (ch != *s) {
4946 status = 1;
4947 *s = ch;
4948 }
4949 s++;
4950 }
4951
4952 return status;
4953}
4954
Tim Petersced69f82003-09-16 20:30:58 +00004955static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956int fixswapcase(PyUnicodeObject *self)
4957{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004958 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 Py_UNICODE *s = self->str;
4960 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004961
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 while (len-- > 0) {
4963 if (Py_UNICODE_ISUPPER(*s)) {
4964 *s = Py_UNICODE_TOLOWER(*s);
4965 status = 1;
4966 } else if (Py_UNICODE_ISLOWER(*s)) {
4967 *s = Py_UNICODE_TOUPPER(*s);
4968 status = 1;
4969 }
4970 s++;
4971 }
4972
4973 return status;
4974}
4975
Tim Petersced69f82003-09-16 20:30:58 +00004976static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977int fixcapitalize(PyUnicodeObject *self)
4978{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004979 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004980 Py_UNICODE *s = self->str;
4981 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004982
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004983 if (len == 0)
4984 return 0;
4985 if (Py_UNICODE_ISLOWER(*s)) {
4986 *s = Py_UNICODE_TOUPPER(*s);
4987 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004989 s++;
4990 while (--len > 0) {
4991 if (Py_UNICODE_ISUPPER(*s)) {
4992 *s = Py_UNICODE_TOLOWER(*s);
4993 status = 1;
4994 }
4995 s++;
4996 }
4997 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998}
4999
5000static
5001int fixtitle(PyUnicodeObject *self)
5002{
5003 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5004 register Py_UNICODE *e;
5005 int previous_is_cased;
5006
5007 /* Shortcut for single character strings */
5008 if (PyUnicode_GET_SIZE(self) == 1) {
5009 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5010 if (*p != ch) {
5011 *p = ch;
5012 return 1;
5013 }
5014 else
5015 return 0;
5016 }
Tim Petersced69f82003-09-16 20:30:58 +00005017
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 e = p + PyUnicode_GET_SIZE(self);
5019 previous_is_cased = 0;
5020 for (; p < e; p++) {
5021 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005022
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023 if (previous_is_cased)
5024 *p = Py_UNICODE_TOLOWER(ch);
5025 else
5026 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005027
5028 if (Py_UNICODE_ISLOWER(ch) ||
5029 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 Py_UNICODE_ISTITLE(ch))
5031 previous_is_cased = 1;
5032 else
5033 previous_is_cased = 0;
5034 }
5035 return 1;
5036}
5037
/* Join the items of seq with separator and return the result as a new
   reference, or NULL on error.

   separator may be NULL, in which case a single blank is used; otherwise
   it is converted with PyUnicode_FromObject().  Every item of seq must
   pass PyUnicode_Check() or PyString_Check() (TypeError otherwise) and is
   converted with PyUnicode_FromObject().  The result buffer starts at 100
   units and is doubled until it fits; OverflowError is raised if a size
   computation wraps around. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *internal_separator = NULL;
    const Py_UNICODE blank = ' ';
    const Py_UNICODE *sep = &blank;
    Py_ssize_t seplen = 1;
    PyUnicodeObject *res = NULL; /* the result */
    Py_ssize_t res_alloc = 100;  /* # Py_UNICODE units allocated in res
                                    (passed to _PyUnicode_New/_Resize) */
    Py_ssize_t res_used;         /* # Py_UNICODE units used so far */
    Py_UNICODE *res_p;       /* pointer to first free unit in res's string area */
    PyObject *fseq;          /* PySequence_Fast(seq) */
    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
    PyObject *item;
    Py_ssize_t i;

    fseq = PySequence_Fast(seq, "");
    if (fseq == NULL) {
        return NULL;
    }

    /* Grrrr.  A codec may be invoked to convert str objects to
     * Unicode, and so it's possible to call back into Python code
     * during PyUnicode_FromObject(), and so it's possible for a sick
     * codec to change the size of fseq (if seq is a list).  Therefore
     * we have to keep refetching the size -- can't assume seqlen
     * is invariant.
     */
    seqlen = PySequence_Fast_GET_SIZE(fseq);
    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
        goto Done;
    }
    /* If singleton sequence with an exact Unicode, return that. */
    if (seqlen == 1) {
        item = PySequence_Fast_GET_ITEM(fseq, 0);
        if (PyUnicode_CheckExact(item)) {
            Py_INCREF(item);
            res = (PyUnicodeObject *)item;
            goto Done;
        }
    }

    /* At least two items to join, or one that isn't exact Unicode. */
    if (seqlen > 1) {
        /* Set up sep and seplen -- they're needed. */
        if (separator == NULL) {
            sep = &blank;
            seplen = 1;
        }
        else {
            internal_separator = PyUnicode_FromObject(separator);
            if (internal_separator == NULL)
                goto onError;
            sep = PyUnicode_AS_UNICODE(internal_separator);
            seplen = PyUnicode_GET_SIZE(internal_separator);
            /* In case PyUnicode_FromObject() mutated seq. */
            seqlen = PySequence_Fast_GET_SIZE(fseq);
        }
    }

    /* Get space. */
    res = _PyUnicode_New(res_alloc);
    if (res == NULL)
        goto onError;
    res_p = PyUnicode_AS_UNICODE(res);
    res_used = 0;

    for (i = 0; i < seqlen; ++i) {
        Py_ssize_t itemlen;
        Py_ssize_t new_res_used;

        item = PySequence_Fast_GET_ITEM(fseq, i);
        /* Convert item to Unicode. */
        if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected string or Unicode,"
                         " %.80s found",
                         i, item->ob_type->tp_name);
            /* no reference to item is owned yet, so jump straight to
               onError rather than Overflow (which releases item) */
            goto onError;
        }
        item = PyUnicode_FromObject(item);
        if (item == NULL)
            goto onError;
        /* We own a reference to item from here on. */

        /* In case PyUnicode_FromObject() mutated seq. */
        seqlen = PySequence_Fast_GET_SIZE(fseq);

        /* Make sure we have enough space for the separator and the item. */
        itemlen = PyUnicode_GET_SIZE(item);
        new_res_used = res_used + itemlen;
        /* a negative sum means the addition wrapped around */
        if (new_res_used < 0)
            goto Overflow;
        /* a separator follows every item except the last one */
        if (i < seqlen - 1) {
            new_res_used += seplen;
            if (new_res_used < 0)
                goto Overflow;
        }
        if (new_res_used > res_alloc) {
            /* double allocated size until it's big enough */
            do {
                res_alloc += res_alloc;
                if (res_alloc <= 0)
                    goto Overflow;
            } while (new_res_used > res_alloc);
            if (_PyUnicode_Resize(&res, res_alloc) < 0) {
                Py_DECREF(item);
                goto onError;
            }
            /* res was passed by address to the resize call, so recompute
               the write pointer from it */
            res_p = PyUnicode_AS_UNICODE(res) + res_used;
        }

        /* Copy item, and maybe the separator. */
        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
        res_p += itemlen;
        if (i < seqlen - 1) {
            Py_UNICODE_COPY(res_p, sep, seplen);
            res_p += seplen;
        }
        Py_DECREF(item);
        res_used = new_res_used;
    }

    /* Shrink res to match the used area; this probably can't fail,
     * but it's cheap to check.
     */
    if (_PyUnicode_Resize(&res, res_used) < 0)
        goto onError;

 Done:
    /* success exit; res may still be NULL here if _PyUnicode_New(0)
       failed for the empty-sequence case */
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    return (PyObject *)res;

 Overflow:
    /* only reached while a reference to item is owned */
    PyErr_SetString(PyExc_OverflowError,
                    "join() result is too long for a Python string");
    Py_DECREF(item);
    /* fall through */

 onError:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    Py_XDECREF(res);
    return NULL;
}
5186
Tim Petersced69f82003-09-16 20:30:58 +00005187static
5188PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005189 Py_ssize_t left,
5190 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 Py_UNICODE fill)
5192{
5193 PyUnicodeObject *u;
5194
5195 if (left < 0)
5196 left = 0;
5197 if (right < 0)
5198 right = 0;
5199
Tim Peters7a29bd52001-09-12 03:03:31 +00005200 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 Py_INCREF(self);
5202 return self;
5203 }
5204
5205 u = _PyUnicode_New(left + self->length + right);
5206 if (u) {
5207 if (left)
5208 Py_UNICODE_FILL(u->str, fill, left);
5209 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5210 if (right)
5211 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5212 }
5213
5214 return u;
5215}
5216
/* Append data[left:right] as a new unicode object to the result list.

   Relies on names that must exist at the point of use: a `PyObject *str`
   scratch variable, the `list` being built, and an `onError` label that
   is jumped to when the object cannot be created or appended.  The
   temporary reference held in `str` is released in both cases.

   NOTE: expands to several statements ending in an if/else, so only use
   it as a full statement inside a braced block. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
5227
5228static
5229PyObject *split_whitespace(PyUnicodeObject *self,
5230 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005231 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005233 register Py_ssize_t i;
5234 register Py_ssize_t j;
5235 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 PyObject *str;
5237
5238 for (i = j = 0; i < len; ) {
5239 /* find a token */
5240 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5241 i++;
5242 j = i;
5243 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5244 i++;
5245 if (j < i) {
5246 if (maxcount-- <= 0)
5247 break;
5248 SPLIT_APPEND(self->str, j, i);
5249 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5250 i++;
5251 j = i;
5252 }
5253 }
5254 if (j < len) {
5255 SPLIT_APPEND(self->str, j, len);
5256 }
5257 return list;
5258
5259 onError:
5260 Py_DECREF(list);
5261 return NULL;
5262}
5263
5264PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005265 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 register Py_ssize_t i;
5268 register Py_ssize_t j;
5269 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 PyObject *list;
5271 PyObject *str;
5272 Py_UNICODE *data;
5273
5274 string = PyUnicode_FromObject(string);
5275 if (string == NULL)
5276 return NULL;
5277 data = PyUnicode_AS_UNICODE(string);
5278 len = PyUnicode_GET_SIZE(string);
5279
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 list = PyList_New(0);
5281 if (!list)
5282 goto onError;
5283
5284 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005285 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005286
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005288 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290
5291 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005292 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 if (i < len) {
5294 if (data[i] == '\r' && i + 1 < len &&
5295 data[i+1] == '\n')
5296 i += 2;
5297 else
5298 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005299 if (keepends)
5300 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 }
Guido van Rossum86662912000-04-11 15:38:46 +00005302 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 j = i;
5304 }
5305 if (j < len) {
5306 SPLIT_APPEND(data, j, len);
5307 }
5308
5309 Py_DECREF(string);
5310 return list;
5311
5312 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005313 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314 Py_DECREF(string);
5315 return NULL;
5316}
5317
Tim Petersced69f82003-09-16 20:30:58 +00005318static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319PyObject *split_char(PyUnicodeObject *self,
5320 PyObject *list,
5321 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005322 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005324 register Py_ssize_t i;
5325 register Py_ssize_t j;
5326 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 PyObject *str;
5328
5329 for (i = j = 0; i < len; ) {
5330 if (self->str[i] == ch) {
5331 if (maxcount-- <= 0)
5332 break;
5333 SPLIT_APPEND(self->str, j, i);
5334 i = j = i + 1;
5335 } else
5336 i++;
5337 }
5338 if (j <= len) {
5339 SPLIT_APPEND(self->str, j, len);
5340 }
5341 return list;
5342
5343 onError:
5344 Py_DECREF(list);
5345 return NULL;
5346}
5347
Tim Petersced69f82003-09-16 20:30:58 +00005348static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349PyObject *split_substring(PyUnicodeObject *self,
5350 PyObject *list,
5351 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005354 register Py_ssize_t i;
5355 register Py_ssize_t j;
5356 Py_ssize_t len = self->length;
5357 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 PyObject *str;
5359
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005360 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 if (Py_UNICODE_MATCH(self, i, substring)) {
5362 if (maxcount-- <= 0)
5363 break;
5364 SPLIT_APPEND(self->str, j, i);
5365 i = j = i + sublen;
5366 } else
5367 i++;
5368 }
5369 if (j <= len) {
5370 SPLIT_APPEND(self->str, j, len);
5371 }
5372 return list;
5373
5374 onError:
5375 Py_DECREF(list);
5376 return NULL;
5377}
5378
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005379static
5380PyObject *rsplit_whitespace(PyUnicodeObject *self,
5381 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005382 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005383{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005384 register Py_ssize_t i;
5385 register Py_ssize_t j;
5386 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005387 PyObject *str;
5388
5389 for (i = j = len - 1; i >= 0; ) {
5390 /* find a token */
5391 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5392 i--;
5393 j = i;
5394 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5395 i--;
5396 if (j > i) {
5397 if (maxcount-- <= 0)
5398 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005399 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005400 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5401 i--;
5402 j = i;
5403 }
5404 }
5405 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005406 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005407 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005408 if (PyList_Reverse(list) < 0)
5409 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005410 return list;
5411
5412 onError:
5413 Py_DECREF(list);
5414 return NULL;
5415}
5416
5417static
5418PyObject *rsplit_char(PyUnicodeObject *self,
5419 PyObject *list,
5420 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005421 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005422{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005423 register Py_ssize_t i;
5424 register Py_ssize_t j;
5425 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005426 PyObject *str;
5427
5428 for (i = j = len - 1; i >= 0; ) {
5429 if (self->str[i] == ch) {
5430 if (maxcount-- <= 0)
5431 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005432 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005433 j = i = i - 1;
5434 } else
5435 i--;
5436 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005437 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005438 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005439 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005440 if (PyList_Reverse(list) < 0)
5441 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005442 return list;
5443
5444 onError:
5445 Py_DECREF(list);
5446 return NULL;
5447}
5448
5449static
5450PyObject *rsplit_substring(PyUnicodeObject *self,
5451 PyObject *list,
5452 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005453 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005455 register Py_ssize_t i;
5456 register Py_ssize_t j;
5457 Py_ssize_t len = self->length;
5458 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005459 PyObject *str;
5460
5461 for (i = len - sublen, j = len; i >= 0; ) {
5462 if (Py_UNICODE_MATCH(self, i, substring)) {
5463 if (maxcount-- <= 0)
5464 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005465 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005466 j = i;
5467 i -= sublen;
5468 } else
5469 i--;
5470 }
5471 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005472 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005473 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005474 if (PyList_Reverse(list) < 0)
5475 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005476 return list;
5477
5478 onError:
5479 Py_DECREF(list);
5480 return NULL;
5481}
5482
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483#undef SPLIT_APPEND
5484
5485static
5486PyObject *split(PyUnicodeObject *self,
5487 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005488 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489{
5490 PyObject *list;
5491
5492 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005493 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494
5495 list = PyList_New(0);
5496 if (!list)
5497 return NULL;
5498
5499 if (substring == NULL)
5500 return split_whitespace(self,list,maxcount);
5501
5502 else if (substring->length == 1)
5503 return split_char(self,list,substring->str[0],maxcount);
5504
5505 else if (substring->length == 0) {
5506 Py_DECREF(list);
5507 PyErr_SetString(PyExc_ValueError, "empty separator");
5508 return NULL;
5509 }
5510 else
5511 return split_substring(self,list,substring,maxcount);
5512}
5513
Tim Petersced69f82003-09-16 20:30:58 +00005514static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005515PyObject *rsplit(PyUnicodeObject *self,
5516 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005517 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005518{
5519 PyObject *list;
5520
5521 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005522 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005523
5524 list = PyList_New(0);
5525 if (!list)
5526 return NULL;
5527
5528 if (substring == NULL)
5529 return rsplit_whitespace(self,list,maxcount);
5530
5531 else if (substring->length == 1)
5532 return rsplit_char(self,list,substring->str[0],maxcount);
5533
5534 else if (substring->length == 0) {
5535 Py_DECREF(list);
5536 PyErr_SetString(PyExc_ValueError, "empty separator");
5537 return NULL;
5538 }
5539 else
5540 return rsplit_substring(self,list,substring,maxcount);
5541}
5542
5543static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544PyObject *replace(PyUnicodeObject *self,
5545 PyUnicodeObject *str1,
5546 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005547 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548{
5549 PyUnicodeObject *u;
5550
5551 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005552 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553
Thomas Wouters477c8d52006-05-27 19:21:47 +00005554 if (str1->length == str2->length) {
5555 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005556 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005557 if (str1->length == 1) {
5558 /* replace characters */
5559 Py_UNICODE u1, u2;
5560 if (!findchar(self->str, self->length, str1->str[0]))
5561 goto nothing;
5562 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5563 if (!u)
5564 return NULL;
5565 Py_UNICODE_COPY(u->str, self->str, self->length);
5566 u1 = str1->str[0];
5567 u2 = str2->str[0];
5568 for (i = 0; i < u->length; i++)
5569 if (u->str[i] == u1) {
5570 if (--maxcount < 0)
5571 break;
5572 u->str[i] = u2;
5573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005575 i = fastsearch(
5576 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005578 if (i < 0)
5579 goto nothing;
5580 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5581 if (!u)
5582 return NULL;
5583 Py_UNICODE_COPY(u->str, self->str, self->length);
5584 while (i <= self->length - str1->length)
5585 if (Py_UNICODE_MATCH(self, i, str1)) {
5586 if (--maxcount < 0)
5587 break;
5588 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5589 i += str1->length;
5590 } else
5591 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005594
5595 Py_ssize_t n, i, j, e;
5596 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 Py_UNICODE *p;
5598
5599 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005600 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 if (n > maxcount)
5602 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005603 if (n == 0)
5604 goto nothing;
5605 /* new_size = self->length + n * (str2->length - str1->length)); */
5606 delta = (str2->length - str1->length);
5607 if (delta == 0) {
5608 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005610 product = n * (str2->length - str1->length);
5611 if ((product / (str2->length - str1->length)) != n) {
5612 PyErr_SetString(PyExc_OverflowError,
5613 "replace string is too long");
5614 return NULL;
5615 }
5616 new_size = self->length + product;
5617 if (new_size < 0) {
5618 PyErr_SetString(PyExc_OverflowError,
5619 "replace string is too long");
5620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 }
5622 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005623 u = _PyUnicode_New(new_size);
5624 if (!u)
5625 return NULL;
5626 i = 0;
5627 p = u->str;
5628 e = self->length - str1->length;
5629 if (str1->length > 0) {
5630 while (n-- > 0) {
5631 /* look for next match */
5632 j = i;
5633 while (j <= e) {
5634 if (Py_UNICODE_MATCH(self, j, str1))
5635 break;
5636 j++;
5637 }
5638 if (j > i) {
5639 if (j > e)
5640 break;
5641 /* copy unchanged part [i:j] */
5642 Py_UNICODE_COPY(p, self->str+i, j-i);
5643 p += j - i;
5644 }
5645 /* copy substitution string */
5646 if (str2->length > 0) {
5647 Py_UNICODE_COPY(p, str2->str, str2->length);
5648 p += str2->length;
5649 }
5650 i = j + str1->length;
5651 }
5652 if (i < self->length)
5653 /* copy tail [i:] */
5654 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5655 } else {
5656 /* interleave */
5657 while (n > 0) {
5658 Py_UNICODE_COPY(p, str2->str, str2->length);
5659 p += str2->length;
5660 if (--n <= 0)
5661 break;
5662 *p++ = self->str[i++];
5663 }
5664 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005668
5669nothing:
5670 /* nothing to replace; return original string (when possible) */
5671 if (PyUnicode_CheckExact(self)) {
5672 Py_INCREF(self);
5673 return (PyObject *) self;
5674 }
5675 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676}
5677
5678/* --- Unicode Object Methods --------------------------------------------- */
5679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005680PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681"S.title() -> unicode\n\
5682\n\
5683Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005684characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685
5686static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005687unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 return fixup(self, fixtitle);
5690}
5691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005692PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693"S.capitalize() -> unicode\n\
5694\n\
5695Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005696have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697
5698static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005699unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 return fixup(self, fixcapitalize);
5702}
5703
5704#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005705PyDoc_STRVAR(capwords__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706"S.capwords() -> unicode\n\
5707\n\
5708Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005709normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710
5711static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005712unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713{
5714 PyObject *list;
5715 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005716 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 /* Split into words */
5719 list = split(self, NULL, -1);
5720 if (!list)
5721 return NULL;
5722
5723 /* Capitalize each word */
5724 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5725 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5726 fixcapitalize);
5727 if (item == NULL)
5728 goto onError;
5729 Py_DECREF(PyList_GET_ITEM(list, i));
5730 PyList_SET_ITEM(list, i, item);
5731 }
5732
5733 /* Join the words to form a new string */
5734 item = PyUnicode_Join(NULL, list);
5735
5736onError:
5737 Py_DECREF(list);
5738 return (PyObject *)item;
5739}
5740#endif
5741
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005742/* Argument converter. Coerces to a single unicode character */
5743
5744static int
5745convert_uc(PyObject *obj, void *addr)
5746{
5747 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5748 PyObject *uniobj;
5749 Py_UNICODE *unistr;
5750
5751 uniobj = PyUnicode_FromObject(obj);
5752 if (uniobj == NULL) {
5753 PyErr_SetString(PyExc_TypeError,
5754 "The fill character cannot be converted to Unicode");
5755 return 0;
5756 }
5757 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5758 PyErr_SetString(PyExc_TypeError,
5759 "The fill character must be exactly one character long");
5760 Py_DECREF(uniobj);
5761 return 0;
5762 }
5763 unistr = PyUnicode_AS_UNICODE(uniobj);
5764 *fillcharloc = unistr[0];
5765 Py_DECREF(uniobj);
5766 return 1;
5767}
5768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005769PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005770"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005772Return S centered in a Unicode string of length width. Padding is\n\
5773done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
5775static PyObject *
5776unicode_center(PyUnicodeObject *self, PyObject *args)
5777{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005778 Py_ssize_t marg, left;
5779 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005780 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781
Thomas Woutersde017742006-02-16 19:34:37 +00005782 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 return NULL;
5784
Tim Peters7a29bd52001-09-12 03:03:31 +00005785 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 Py_INCREF(self);
5787 return (PyObject*) self;
5788 }
5789
5790 marg = width - self->length;
5791 left = marg / 2 + (marg & width & 1);
5792
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005793 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794}
5795
Marc-André Lemburge5034372000-08-08 08:04:29 +00005796#if 0
5797
5798/* This code should go into some future Unicode collation support
5799 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005800 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005801
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005802/* speedy UTF-16 code point order comparison */
5803/* gleaned from: */
5804/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5805
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005806static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005807{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005808 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005809 0, 0, 0, 0, 0, 0, 0, 0,
5810 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005811 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005812};
5813
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814static int
5815unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5816{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005817 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005818
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 Py_UNICODE *s1 = str1->str;
5820 Py_UNICODE *s2 = str2->str;
5821
5822 len1 = str1->length;
5823 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005824
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005826 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005827
5828 c1 = *s1++;
5829 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005830
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005831 if (c1 > (1<<11) * 26)
5832 c1 += utf16Fixup[c1>>11];
5833 if (c2 > (1<<11) * 26)
5834 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005835 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005836
5837 if (c1 != c2)
5838 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005839
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005840 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 }
5842
5843 return (len1 < len2) ? -1 : (len1 != len2);
5844}
5845
Marc-André Lemburge5034372000-08-08 08:04:29 +00005846#else
5847
5848static int
5849unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5850{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005851 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005852
5853 Py_UNICODE *s1 = str1->str;
5854 Py_UNICODE *s2 = str2->str;
5855
5856 len1 = str1->length;
5857 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005858
Marc-André Lemburge5034372000-08-08 08:04:29 +00005859 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005860 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005861
Fredrik Lundh45714e92001-06-26 16:39:36 +00005862 c1 = *s1++;
5863 c2 = *s2++;
5864
5865 if (c1 != c2)
5866 return (c1 < c2) ? -1 : 1;
5867
Marc-André Lemburge5034372000-08-08 08:04:29 +00005868 len1--; len2--;
5869 }
5870
5871 return (len1 < len2) ? -1 : (len1 != len2);
5872}
5873
5874#endif
5875
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876int PyUnicode_Compare(PyObject *left,
5877 PyObject *right)
5878{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005879 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5880 return unicode_compare((PyUnicodeObject *)left,
5881 (PyUnicodeObject *)right);
5882 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5883 (PyUnicode_Check(left) && PyString_Check(right))) {
5884 if (PyUnicode_Check(left))
5885 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5886 if (PyUnicode_Check(right))
5887 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5888 assert(PyString_Check(left));
5889 assert(PyString_Check(right));
5890 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005892 PyErr_Format(PyExc_TypeError,
5893 "Can't compare %.100s and %.100s",
5894 left->ob_type->tp_name,
5895 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 return -1;
5897}
5898
Martin v. Löwis5b222132007-06-10 09:51:05 +00005899int
5900PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5901{
5902 int i;
5903 Py_UNICODE *id;
5904 assert(PyUnicode_Check(uni));
5905 id = PyUnicode_AS_UNICODE(uni);
5906 /* Compare Unicode string and source character set string */
5907 for (i = 0; id[i] && str[i]; i++)
5908 if (id[i] != str[i])
5909 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5910 if (id[i])
5911 return 1; /* uni is longer */
5912 if (str[i])
5913 return -1; /* str is longer */
5914 return 0;
5915}
5916
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005917PyObject *PyUnicode_RichCompare(PyObject *left,
5918 PyObject *right,
5919 int op)
5920{
5921 int result;
5922
5923 result = PyUnicode_Compare(left, right);
5924 if (result == -1 && PyErr_Occurred())
5925 goto onError;
5926
5927 /* Convert the return value to a Boolean */
5928 switch (op) {
5929 case Py_EQ:
5930 result = (result == 0);
5931 break;
5932 case Py_NE:
5933 result = (result != 0);
5934 break;
5935 case Py_LE:
5936 result = (result <= 0);
5937 break;
5938 case Py_GE:
5939 result = (result >= 0);
5940 break;
5941 case Py_LT:
5942 result = (result == -1);
5943 break;
5944 case Py_GT:
5945 result = (result == 1);
5946 break;
5947 }
5948 return PyBool_FromLong(result);
5949
5950 onError:
5951
5952 /* Standard case
5953
5954 Type errors mean that PyUnicode_FromObject() could not convert
5955 one of the arguments (usually the right hand side) to Unicode,
5956 ie. we can't handle the comparison request. However, it is
5957 possible that the other object knows a comparison method, which
5958 is why we return Py_NotImplemented to give the other object a
5959 chance.
5960
5961 */
5962 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5963 PyErr_Clear();
5964 Py_INCREF(Py_NotImplemented);
5965 return Py_NotImplemented;
5966 }
5967 if (op != Py_EQ && op != Py_NE)
5968 return NULL;
5969
5970 /* Equality comparison.
5971
5972 This is a special case: we silence any PyExc_UnicodeDecodeError
5973 and instead turn it into a PyErr_UnicodeWarning.
5974
5975 */
5976 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5977 return NULL;
5978 PyErr_Clear();
5979 if (PyErr_Warn(PyExc_UnicodeWarning,
5980 (op == Py_EQ) ?
5981 "Unicode equal comparison "
5982 "failed to convert both arguments to Unicode - "
5983 "interpreting them as being unequal" :
5984 "Unicode unequal comparison "
5985 "failed to convert both arguments to Unicode - "
5986 "interpreting them as being unequal"
5987 ) < 0)
5988 return NULL;
5989 result = (op == Py_NE);
5990 return PyBool_FromLong(result);
5991}
5992
Guido van Rossum403d68b2000-03-13 15:55:09 +00005993int PyUnicode_Contains(PyObject *container,
5994 PyObject *element)
5995{
Thomas Wouters477c8d52006-05-27 19:21:47 +00005996 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005997 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005998
5999 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006000 sub = PyUnicode_FromObject(element);
6001 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006002 PyErr_Format(PyExc_TypeError,
6003 "'in <string>' requires string as left operand, not %s",
6004 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006005 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006006 }
6007
Thomas Wouters477c8d52006-05-27 19:21:47 +00006008 str = PyUnicode_FromObject(container);
6009 if (!str) {
6010 Py_DECREF(sub);
6011 return -1;
6012 }
6013
6014 result = stringlib_contains_obj(str, sub);
6015
6016 Py_DECREF(str);
6017 Py_DECREF(sub);
6018
Guido van Rossum403d68b2000-03-13 15:55:09 +00006019 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006020}
6021
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022/* Concat to string or Unicode object giving a new Unicode object. */
6023
6024PyObject *PyUnicode_Concat(PyObject *left,
6025 PyObject *right)
6026{
6027 PyUnicodeObject *u = NULL, *v = NULL, *w;
6028
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006029 if (PyBytes_Check(left) || PyBytes_Check(right))
6030 return PyBytes_Concat(left, right);
6031
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 /* Coerce the two arguments */
6033 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6034 if (u == NULL)
6035 goto onError;
6036 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6037 if (v == NULL)
6038 goto onError;
6039
6040 /* Shortcuts */
6041 if (v == unicode_empty) {
6042 Py_DECREF(v);
6043 return (PyObject *)u;
6044 }
6045 if (u == unicode_empty) {
6046 Py_DECREF(u);
6047 return (PyObject *)v;
6048 }
6049
6050 /* Concat the two Unicode strings */
6051 w = _PyUnicode_New(u->length + v->length);
6052 if (w == NULL)
6053 goto onError;
6054 Py_UNICODE_COPY(w->str, u->str, u->length);
6055 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6056
6057 Py_DECREF(u);
6058 Py_DECREF(v);
6059 return (PyObject *)w;
6060
6061onError:
6062 Py_XDECREF(u);
6063 Py_XDECREF(v);
6064 return NULL;
6065}
6066
Walter Dörwald1ab83302007-05-18 17:15:44 +00006067void
6068PyUnicode_Append(PyObject **pleft, PyObject *right)
6069{
6070 PyObject *new;
6071 if (*pleft == NULL)
6072 return;
6073 if (right == NULL || !PyUnicode_Check(*pleft)) {
6074 Py_DECREF(*pleft);
6075 *pleft = NULL;
6076 return;
6077 }
6078 new = PyUnicode_Concat(*pleft, right);
6079 Py_DECREF(*pleft);
6080 *pleft = new;
6081}
6082
6083void
6084PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6085{
6086 PyUnicode_Append(pleft, right);
6087 Py_XDECREF(right);
6088}
6089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006090PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091"S.count(sub[, start[, end]]) -> int\n\
6092\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006093Return the number of non-overlapping occurrences of substring sub in\n\
6094Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006095interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096
6097static PyObject *
6098unicode_count(PyUnicodeObject *self, PyObject *args)
6099{
6100 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006101 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006102 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 PyObject *result;
6104
Guido van Rossumb8872e62000-05-09 14:14:27 +00006105 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6106 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 return NULL;
6108
6109 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006110 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 if (substring == NULL)
6112 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006113
Thomas Wouters477c8d52006-05-27 19:21:47 +00006114 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115
Thomas Wouters477c8d52006-05-27 19:21:47 +00006116 result = PyInt_FromSsize_t(
6117 stringlib_count(self->str + start, end - start,
6118 substring->str, substring->length)
6119 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120
6121 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006122
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 return result;
6124}
6125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006126PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006127"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006129Encodes S using the codec registered for encoding. encoding defaults\n\
6130to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006131handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6133'xmlcharrefreplace' as well as any other name registered with\n\
6134codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
6136static PyObject *
6137unicode_encode(PyUnicodeObject *self, PyObject *args)
6138{
6139 char *encoding = NULL;
6140 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006141 PyObject *v;
6142
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6144 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006145 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006146 if (v == NULL)
6147 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006148 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006149 if (PyString_Check(v)) {
6150 /* Old codec, turn it into bytes */
6151 PyObject *b = PyBytes_FromObject(v);
6152 Py_DECREF(v);
6153 return b;
6154 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006155 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006156 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006157 "(type=%.400s)",
6158 v->ob_type->tp_name);
6159 Py_DECREF(v);
6160 return NULL;
6161 }
6162 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006163
6164 onError:
6165 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006166}
6167
6168PyDoc_STRVAR(decode__doc__,
6169"S.decode([encoding[,errors]]) -> string or unicode\n\
6170\n\
6171Decodes S using the codec registered for encoding. encoding defaults\n\
6172to the default encoding. errors may be given to set a different error\n\
6173handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6174a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6175as well as any other name registerd with codecs.register_error that is\n\
6176able to handle UnicodeDecodeErrors.");
6177
6178static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006179unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006180{
6181 char *encoding = NULL;
6182 char *errors = NULL;
6183 PyObject *v;
6184
6185 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6186 return NULL;
6187 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006188 if (v == NULL)
6189 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006190 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6191 PyErr_Format(PyExc_TypeError,
6192 "decoder did not return a string/unicode object "
6193 "(type=%.400s)",
6194 v->ob_type->tp_name);
6195 Py_DECREF(v);
6196 return NULL;
6197 }
6198 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006199
6200 onError:
6201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202}
6203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006204PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205"S.expandtabs([tabsize]) -> unicode\n\
6206\n\
6207Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006208If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209
6210static PyObject*
6211unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6212{
6213 Py_UNICODE *e;
6214 Py_UNICODE *p;
6215 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006216 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 PyUnicodeObject *u;
6218 int tabsize = 8;
6219
6220 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6221 return NULL;
6222
Thomas Wouters7e474022000-07-16 12:04:32 +00006223 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006224 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 e = self->str + self->length;
6226 for (p = self->str; p < e; p++)
6227 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006228 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006230 if (old_j > j) {
6231 PyErr_SetString(PyExc_OverflowError,
6232 "new string is too long");
6233 return NULL;
6234 }
6235 old_j = j;
6236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 }
6238 else {
6239 j++;
6240 if (*p == '\n' || *p == '\r') {
6241 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006242 old_j = j = 0;
6243 if (i < 0) {
6244 PyErr_SetString(PyExc_OverflowError,
6245 "new string is too long");
6246 return NULL;
6247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 }
6249 }
6250
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006251 if ((i + j) < 0) {
6252 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6253 return NULL;
6254 }
6255
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 /* Second pass: create output string and fill it */
6257 u = _PyUnicode_New(i + j);
6258 if (!u)
6259 return NULL;
6260
6261 j = 0;
6262 q = u->str;
6263
6264 for (p = self->str; p < e; p++)
6265 if (*p == '\t') {
6266 if (tabsize > 0) {
6267 i = tabsize - (j % tabsize);
6268 j += i;
6269 while (i--)
6270 *q++ = ' ';
6271 }
6272 }
6273 else {
6274 j++;
6275 *q++ = *p;
6276 if (*p == '\n' || *p == '\r')
6277 j = 0;
6278 }
6279
6280 return (PyObject*) u;
6281}
6282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006283PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284"S.find(sub [,start [,end]]) -> int\n\
6285\n\
6286Return the lowest index in S where substring sub is found,\n\
6287such that sub is contained within s[start,end]. Optional\n\
6288arguments start and end are interpreted as in slice notation.\n\
6289\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006290Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291
6292static PyObject *
6293unicode_find(PyUnicodeObject *self, PyObject *args)
6294{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006295 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006296 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006297 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006298 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299
Guido van Rossumb8872e62000-05-09 14:14:27 +00006300 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6301 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006303 substring = PyUnicode_FromObject(substring);
6304 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 return NULL;
6306
Thomas Wouters477c8d52006-05-27 19:21:47 +00006307 result = stringlib_find_slice(
6308 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6309 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6310 start, end
6311 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312
6313 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006314
6315 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316}
6317
6318static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006319unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320{
6321 if (index < 0 || index >= self->length) {
6322 PyErr_SetString(PyExc_IndexError, "string index out of range");
6323 return NULL;
6324 }
6325
6326 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6327}
6328
6329static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006330unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006332 /* Since Unicode objects compare equal to their UTF-8 string
6333 counterparts, we hash the UTF-8 string. */
6334 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6335 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336}
6337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006338PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339"S.index(sub [,start [,end]]) -> int\n\
6340\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006341Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
6343static PyObject *
6344unicode_index(PyUnicodeObject *self, PyObject *args)
6345{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006346 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006347 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006348 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006349 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350
Guido van Rossumb8872e62000-05-09 14:14:27 +00006351 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6352 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006354 substring = PyUnicode_FromObject(substring);
6355 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356 return NULL;
6357
Thomas Wouters477c8d52006-05-27 19:21:47 +00006358 result = stringlib_find_slice(
6359 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6360 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6361 start, end
6362 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
6364 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006365
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 if (result < 0) {
6367 PyErr_SetString(PyExc_ValueError, "substring not found");
6368 return NULL;
6369 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006370
Martin v. Löwis18e16552006-02-15 17:27:45 +00006371 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372}
6373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006374PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006375"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006377Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006378at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379
6380static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006381unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382{
6383 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6384 register const Py_UNICODE *e;
6385 int cased;
6386
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 /* Shortcut for single character strings */
6388 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006389 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006391 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006392 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006393 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006394
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 e = p + PyUnicode_GET_SIZE(self);
6396 cased = 0;
6397 for (; p < e; p++) {
6398 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006399
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006401 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 else if (!cased && Py_UNICODE_ISLOWER(ch))
6403 cased = 1;
6404 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006405 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406}
6407
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006408PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006409"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006411Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006412at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
6414static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006415unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416{
6417 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6418 register const Py_UNICODE *e;
6419 int cased;
6420
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 /* Shortcut for single character strings */
6422 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006423 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006425 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006426 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006427 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006428
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 e = p + PyUnicode_GET_SIZE(self);
6430 cased = 0;
6431 for (; p < e; p++) {
6432 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006433
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006435 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 else if (!cased && Py_UNICODE_ISUPPER(ch))
6437 cased = 1;
6438 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006439 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440}
6441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006442PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006443"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006445Return True if S is a titlecased string and there is at least one\n\
6446character in S, i.e. upper- and titlecase characters may only\n\
6447follow uncased characters and lowercase characters only cased ones.\n\
6448Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449
6450static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006451unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452{
6453 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6454 register const Py_UNICODE *e;
6455 int cased, previous_is_cased;
6456
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 /* Shortcut for single character strings */
6458 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006459 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6460 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006462 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006463 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006464 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006465
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 e = p + PyUnicode_GET_SIZE(self);
6467 cased = 0;
6468 previous_is_cased = 0;
6469 for (; p < e; p++) {
6470 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006471
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6473 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006474 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 previous_is_cased = 1;
6476 cased = 1;
6477 }
6478 else if (Py_UNICODE_ISLOWER(ch)) {
6479 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006480 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 previous_is_cased = 1;
6482 cased = 1;
6483 }
6484 else
6485 previous_is_cased = 0;
6486 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006487 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488}
6489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006490PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006491"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006493Return True if all characters in S are whitespace\n\
6494and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495
6496static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006497unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498{
6499 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6500 register const Py_UNICODE *e;
6501
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 /* Shortcut for single character strings */
6503 if (PyUnicode_GET_SIZE(self) == 1 &&
6504 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006505 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006507 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006508 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006509 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006510
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 e = p + PyUnicode_GET_SIZE(self);
6512 for (; p < e; p++) {
6513 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006514 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006516 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517}
6518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006519PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006520"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006521\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006522Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006523and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006524
6525static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006526unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006527{
6528 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6529 register const Py_UNICODE *e;
6530
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006531 /* Shortcut for single character strings */
6532 if (PyUnicode_GET_SIZE(self) == 1 &&
6533 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006534 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006535
6536 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006537 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006538 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006539
6540 e = p + PyUnicode_GET_SIZE(self);
6541 for (; p < e; p++) {
6542 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006543 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006544 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006545 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006546}
6547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006548PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006549"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006550\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006551Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006552and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006553
6554static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006555unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006556{
6557 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6558 register const Py_UNICODE *e;
6559
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006560 /* Shortcut for single character strings */
6561 if (PyUnicode_GET_SIZE(self) == 1 &&
6562 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006563 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006564
6565 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006566 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006567 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006568
6569 e = p + PyUnicode_GET_SIZE(self);
6570 for (; p < e; p++) {
6571 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006572 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006573 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006574 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006575}
6576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006577PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006578"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006580Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006581False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582
6583static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006584unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585{
6586 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6587 register const Py_UNICODE *e;
6588
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 /* Shortcut for single character strings */
6590 if (PyUnicode_GET_SIZE(self) == 1 &&
6591 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006592 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006594 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006595 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006596 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006597
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 e = p + PyUnicode_GET_SIZE(self);
6599 for (; p < e; p++) {
6600 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006601 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006603 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604}
6605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006606PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006607"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006609Return True if all characters in S are digits\n\
6610and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
6612static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006613unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614{
6615 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6616 register const Py_UNICODE *e;
6617
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 /* Shortcut for single character strings */
6619 if (PyUnicode_GET_SIZE(self) == 1 &&
6620 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006621 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006623 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006624 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006625 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006626
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 e = p + PyUnicode_GET_SIZE(self);
6628 for (; p < e; p++) {
6629 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006630 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006632 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633}
6634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006635PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006636"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006638Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006639False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640
6641static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006642unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643{
6644 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6645 register const Py_UNICODE *e;
6646
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 /* Shortcut for single character strings */
6648 if (PyUnicode_GET_SIZE(self) == 1 &&
6649 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006650 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006652 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006653 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006654 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006655
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 e = p + PyUnicode_GET_SIZE(self);
6657 for (; p < e; p++) {
6658 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006659 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006661 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006664PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665"S.join(sequence) -> unicode\n\
6666\n\
6667Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006668sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669
6670static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006671unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006673 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674}
6675
Martin v. Löwis18e16552006-02-15 17:27:45 +00006676static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677unicode_length(PyUnicodeObject *self)
6678{
6679 return self->length;
6680}
6681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006682PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006683"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684\n\
6685Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006686done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
6688static PyObject *
6689unicode_ljust(PyUnicodeObject *self, PyObject *args)
6690{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006691 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006692 Py_UNICODE fillchar = ' ';
6693
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006694 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 return NULL;
6696
Tim Peters7a29bd52001-09-12 03:03:31 +00006697 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 Py_INCREF(self);
6699 return (PyObject*) self;
6700 }
6701
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006702 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703}
6704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006705PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706"S.lower() -> unicode\n\
6707\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006708Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709
6710static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006711unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 return fixup(self, fixlower);
6714}
6715
/* Strip modes; each value doubles as an index into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per strip mode */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for a strip mode: its format string without the
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
6724
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006725/* externally visible for str.strip(unicode) */
6726PyObject *
6727_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6728{
6729 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006730 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006731 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006732 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6733 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006734
Thomas Wouters477c8d52006-05-27 19:21:47 +00006735 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6736
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006737 i = 0;
6738 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006739 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6740 i++;
6741 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006742 }
6743
6744 j = len;
6745 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006746 do {
6747 j--;
6748 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6749 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006750 }
6751
6752 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006753 Py_INCREF(self);
6754 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006755 }
6756 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006757 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006758}
6759
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760
6761static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006762do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006764 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006765 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006766
6767 i = 0;
6768 if (striptype != RIGHTSTRIP) {
6769 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6770 i++;
6771 }
6772 }
6773
6774 j = len;
6775 if (striptype != LEFTSTRIP) {
6776 do {
6777 j--;
6778 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6779 j++;
6780 }
6781
6782 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6783 Py_INCREF(self);
6784 return (PyObject*)self;
6785 }
6786 else
6787 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788}
6789
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006790
6791static PyObject *
6792do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6793{
6794 PyObject *sep = NULL;
6795
6796 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6797 return NULL;
6798
6799 if (sep != NULL && sep != Py_None) {
6800 if (PyUnicode_Check(sep))
6801 return _PyUnicode_XStrip(self, striptype, sep);
6802 else if (PyString_Check(sep)) {
6803 PyObject *res;
6804 sep = PyUnicode_FromObject(sep);
6805 if (sep==NULL)
6806 return NULL;
6807 res = _PyUnicode_XStrip(self, striptype, sep);
6808 Py_DECREF(sep);
6809 return res;
6810 }
6811 else {
6812 PyErr_Format(PyExc_TypeError,
6813 "%s arg must be None, unicode or str",
6814 STRIPNAME(striptype));
6815 return NULL;
6816 }
6817 }
6818
6819 return do_strip(self, striptype);
6820}
6821
6822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006823PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006824"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006825\n\
6826Return a copy of the string S with leading and trailing\n\
6827whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006828If chars is given and not None, remove characters in chars instead.\n\
6829If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006830
6831static PyObject *
6832unicode_strip(PyUnicodeObject *self, PyObject *args)
6833{
6834 if (PyTuple_GET_SIZE(args) == 0)
6835 return do_strip(self, BOTHSTRIP); /* Common case */
6836 else
6837 return do_argstrip(self, BOTHSTRIP, args);
6838}
6839
6840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006841PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006842"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006843\n\
6844Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006845If chars is given and not None, remove characters in chars instead.\n\
6846If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006847
6848static PyObject *
6849unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6850{
6851 if (PyTuple_GET_SIZE(args) == 0)
6852 return do_strip(self, LEFTSTRIP); /* Common case */
6853 else
6854 return do_argstrip(self, LEFTSTRIP, args);
6855}
6856
6857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006858PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006859"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006860\n\
6861Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006862If chars is given and not None, remove characters in chars instead.\n\
6863If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006864
6865static PyObject *
6866unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6867{
6868 if (PyTuple_GET_SIZE(args) == 0)
6869 return do_strip(self, RIGHTSTRIP); /* Common case */
6870 else
6871 return do_argstrip(self, RIGHTSTRIP, args);
6872}
6873
6874
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006876unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877{
6878 PyUnicodeObject *u;
6879 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006880 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006881 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882
6883 if (len < 0)
6884 len = 0;
6885
Tim Peters7a29bd52001-09-12 03:03:31 +00006886 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 /* no repeat, return original string */
6888 Py_INCREF(str);
6889 return (PyObject*) str;
6890 }
Tim Peters8f422462000-09-09 06:13:41 +00006891
6892 /* ensure # of chars needed doesn't overflow int and # of bytes
6893 * needed doesn't overflow size_t
6894 */
6895 nchars = len * str->length;
6896 if (len && nchars / len != str->length) {
6897 PyErr_SetString(PyExc_OverflowError,
6898 "repeated string is too long");
6899 return NULL;
6900 }
6901 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6902 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6903 PyErr_SetString(PyExc_OverflowError,
6904 "repeated string is too long");
6905 return NULL;
6906 }
6907 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 if (!u)
6909 return NULL;
6910
6911 p = u->str;
6912
Thomas Wouters477c8d52006-05-27 19:21:47 +00006913 if (str->length == 1 && len > 0) {
6914 Py_UNICODE_FILL(p, str->str[0], len);
6915 } else {
6916 Py_ssize_t done = 0; /* number of characters copied this far */
6917 if (done < nchars) {
6918 Py_UNICODE_COPY(p, str->str, str->length);
6919 done = str->length;
6920 }
6921 while (done < nchars) {
6922 int n = (done <= nchars-done) ? done : nchars-done;
6923 Py_UNICODE_COPY(p+done, p, n);
6924 done += n;
6925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 }
6927
6928 return (PyObject*) u;
6929}
6930
6931PyObject *PyUnicode_Replace(PyObject *obj,
6932 PyObject *subobj,
6933 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006934 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935{
6936 PyObject *self;
6937 PyObject *str1;
6938 PyObject *str2;
6939 PyObject *result;
6940
6941 self = PyUnicode_FromObject(obj);
6942 if (self == NULL)
6943 return NULL;
6944 str1 = PyUnicode_FromObject(subobj);
6945 if (str1 == NULL) {
6946 Py_DECREF(self);
6947 return NULL;
6948 }
6949 str2 = PyUnicode_FromObject(replobj);
6950 if (str2 == NULL) {
6951 Py_DECREF(self);
6952 Py_DECREF(str1);
6953 return NULL;
6954 }
Tim Petersced69f82003-09-16 20:30:58 +00006955 result = replace((PyUnicodeObject *)self,
6956 (PyUnicodeObject *)str1,
6957 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 maxcount);
6959 Py_DECREF(self);
6960 Py_DECREF(str1);
6961 Py_DECREF(str2);
6962 return result;
6963}
6964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006965PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966"S.replace (old, new[, maxsplit]) -> unicode\n\
6967\n\
6968Return a copy of S with all occurrences of substring\n\
6969old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006970given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971
6972static PyObject*
6973unicode_replace(PyUnicodeObject *self, PyObject *args)
6974{
6975 PyUnicodeObject *str1;
6976 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006977 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 PyObject *result;
6979
Martin v. Löwis18e16552006-02-15 17:27:45 +00006980 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 return NULL;
6982 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6983 if (str1 == NULL)
6984 return NULL;
6985 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006986 if (str2 == NULL) {
6987 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990
6991 result = replace(self, str1, str2, maxcount);
6992
6993 Py_DECREF(str1);
6994 Py_DECREF(str2);
6995 return result;
6996}
6997
6998static
6999PyObject *unicode_repr(PyObject *unicode)
7000{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007001 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007002 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007003 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7004 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7005
7006 /* XXX(nnorwitz): rather than over-allocating, it would be
7007 better to choose a different scheme. Perhaps scan the
7008 first N-chars of the string and allocate based on that size.
7009 */
7010 /* Initial allocation is based on the longest-possible unichr
7011 escape.
7012
7013 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7014 unichr, so in this case it's the longest unichr escape. In
7015 narrow (UTF-16) builds this is five chars per source unichr
7016 since there are two unichrs in the surrogate pair, so in narrow
7017 (UTF-16) builds it's not the longest unichr escape.
7018
7019 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7020 so in the narrow (UTF-16) build case it's the longest unichr
7021 escape.
7022 */
7023
Walter Dörwald1ab83302007-05-18 17:15:44 +00007024 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007025 2 /* quotes */
7026#ifdef Py_UNICODE_WIDE
7027 + 10*size
7028#else
7029 + 6*size
7030#endif
7031 + 1);
7032 if (repr == NULL)
7033 return NULL;
7034
Walter Dörwald1ab83302007-05-18 17:15:44 +00007035 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007036
7037 /* Add quote */
7038 *p++ = (findchar(s, size, '\'') &&
7039 !findchar(s, size, '"')) ? '"' : '\'';
7040 while (size-- > 0) {
7041 Py_UNICODE ch = *s++;
7042
7043 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007044 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007045 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007046 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007047 continue;
7048 }
7049
7050#ifdef Py_UNICODE_WIDE
7051 /* Map 21-bit characters to '\U00xxxxxx' */
7052 else if (ch >= 0x10000) {
7053 *p++ = '\\';
7054 *p++ = 'U';
7055 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7056 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7057 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7058 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7059 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7060 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7061 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7062 *p++ = hexdigits[ch & 0x0000000F];
7063 continue;
7064 }
7065#else
7066 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7067 else if (ch >= 0xD800 && ch < 0xDC00) {
7068 Py_UNICODE ch2;
7069 Py_UCS4 ucs;
7070
7071 ch2 = *s++;
7072 size--;
7073 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7074 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7075 *p++ = '\\';
7076 *p++ = 'U';
7077 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7078 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7079 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7080 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7081 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7082 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7083 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7084 *p++ = hexdigits[ucs & 0x0000000F];
7085 continue;
7086 }
7087 /* Fall through: isolated surrogates are copied as-is */
7088 s--;
7089 size++;
7090 }
7091#endif
7092
7093 /* Map 16-bit characters to '\uxxxx' */
7094 if (ch >= 256) {
7095 *p++ = '\\';
7096 *p++ = 'u';
7097 *p++ = hexdigits[(ch >> 12) & 0x000F];
7098 *p++ = hexdigits[(ch >> 8) & 0x000F];
7099 *p++ = hexdigits[(ch >> 4) & 0x000F];
7100 *p++ = hexdigits[ch & 0x000F];
7101 }
7102
7103 /* Map special whitespace to '\t', \n', '\r' */
7104 else if (ch == '\t') {
7105 *p++ = '\\';
7106 *p++ = 't';
7107 }
7108 else if (ch == '\n') {
7109 *p++ = '\\';
7110 *p++ = 'n';
7111 }
7112 else if (ch == '\r') {
7113 *p++ = '\\';
7114 *p++ = 'r';
7115 }
7116
7117 /* Map non-printable US ASCII to '\xhh' */
7118 else if (ch < ' ' || ch >= 0x7F) {
7119 *p++ = '\\';
7120 *p++ = 'x';
7121 *p++ = hexdigits[(ch >> 4) & 0x000F];
7122 *p++ = hexdigits[ch & 0x000F];
7123 }
7124
7125 /* Copy everything else as-is */
7126 else
7127 *p++ = (char) ch;
7128 }
7129 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007130 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007131
7132 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007133 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007134 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135}
7136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007137PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138"S.rfind(sub [,start [,end]]) -> int\n\
7139\n\
7140Return the highest index in S where substring sub is found,\n\
7141such that sub is contained within s[start,end]. Optional\n\
7142arguments start and end are interpreted as in slice notation.\n\
7143\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145
7146static PyObject *
7147unicode_rfind(PyUnicodeObject *self, PyObject *args)
7148{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007149 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007150 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007151 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007152 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153
Guido van Rossumb8872e62000-05-09 14:14:27 +00007154 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7155 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007157 substring = PyUnicode_FromObject(substring);
7158 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 return NULL;
7160
Thomas Wouters477c8d52006-05-27 19:21:47 +00007161 result = stringlib_rfind_slice(
7162 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7163 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7164 start, end
7165 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166
7167 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007168
7169 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170}
7171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007172PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173"S.rindex(sub [,start [,end]]) -> int\n\
7174\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007175Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176
7177static PyObject *
7178unicode_rindex(PyUnicodeObject *self, PyObject *args)
7179{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007180 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007181 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007182 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007183 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184
Guido van Rossumb8872e62000-05-09 14:14:27 +00007185 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7186 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007188 substring = PyUnicode_FromObject(substring);
7189 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 return NULL;
7191
Thomas Wouters477c8d52006-05-27 19:21:47 +00007192 result = stringlib_rfind_slice(
7193 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7194 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7195 start, end
7196 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197
7198 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007199
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 if (result < 0) {
7201 PyErr_SetString(PyExc_ValueError, "substring not found");
7202 return NULL;
7203 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007204 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205}
7206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007207PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007208"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209\n\
7210Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007211done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212
7213static PyObject *
7214unicode_rjust(PyUnicodeObject *self, PyObject *args)
7215{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007216 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007217 Py_UNICODE fillchar = ' ';
7218
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007219 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 return NULL;
7221
Tim Peters7a29bd52001-09-12 03:03:31 +00007222 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223 Py_INCREF(self);
7224 return (PyObject*) self;
7225 }
7226
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007227 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228}
7229
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007231unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232{
7233 /* standard clamping */
7234 if (start < 0)
7235 start = 0;
7236 if (end < 0)
7237 end = 0;
7238 if (end > self->length)
7239 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007240 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 /* full slice, return original string */
7242 Py_INCREF(self);
7243 return (PyObject*) self;
7244 }
7245 if (start > end)
7246 start = end;
7247 /* copy slice */
7248 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7249 end - start);
7250}
7251
7252PyObject *PyUnicode_Split(PyObject *s,
7253 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007254 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255{
7256 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007257
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 s = PyUnicode_FromObject(s);
7259 if (s == NULL)
7260 return NULL;
7261 if (sep != NULL) {
7262 sep = PyUnicode_FromObject(sep);
7263 if (sep == NULL) {
7264 Py_DECREF(s);
7265 return NULL;
7266 }
7267 }
7268
7269 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7270
7271 Py_DECREF(s);
7272 Py_XDECREF(sep);
7273 return result;
7274}
7275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007276PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277"S.split([sep [,maxsplit]]) -> list of strings\n\
7278\n\
7279Return a list of the words in S, using sep as the\n\
7280delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007281splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007282any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283
7284static PyObject*
7285unicode_split(PyUnicodeObject *self, PyObject *args)
7286{
7287 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007288 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289
Martin v. Löwis18e16552006-02-15 17:27:45 +00007290 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291 return NULL;
7292
7293 if (substring == Py_None)
7294 return split(self, NULL, maxcount);
7295 else if (PyUnicode_Check(substring))
7296 return split(self, (PyUnicodeObject *)substring, maxcount);
7297 else
7298 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7299}
7300
Thomas Wouters477c8d52006-05-27 19:21:47 +00007301PyObject *
7302PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7303{
7304 PyObject* str_obj;
7305 PyObject* sep_obj;
7306 PyObject* out;
7307
7308 str_obj = PyUnicode_FromObject(str_in);
7309 if (!str_obj)
7310 return NULL;
7311 sep_obj = PyUnicode_FromObject(sep_in);
7312 if (!sep_obj) {
7313 Py_DECREF(str_obj);
7314 return NULL;
7315 }
7316
7317 out = stringlib_partition(
7318 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7319 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7320 );
7321
7322 Py_DECREF(sep_obj);
7323 Py_DECREF(str_obj);
7324
7325 return out;
7326}
7327
7328
7329PyObject *
7330PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7331{
7332 PyObject* str_obj;
7333 PyObject* sep_obj;
7334 PyObject* out;
7335
7336 str_obj = PyUnicode_FromObject(str_in);
7337 if (!str_obj)
7338 return NULL;
7339 sep_obj = PyUnicode_FromObject(sep_in);
7340 if (!sep_obj) {
7341 Py_DECREF(str_obj);
7342 return NULL;
7343 }
7344
7345 out = stringlib_rpartition(
7346 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7347 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7348 );
7349
7350 Py_DECREF(sep_obj);
7351 Py_DECREF(str_obj);
7352
7353 return out;
7354}
7355
7356PyDoc_STRVAR(partition__doc__,
7357"S.partition(sep) -> (head, sep, tail)\n\
7358\n\
7359Searches for the separator sep in S, and returns the part before it,\n\
7360the separator itself, and the part after it. If the separator is not\n\
7361found, returns S and two empty strings.");
7362
7363static PyObject*
7364unicode_partition(PyUnicodeObject *self, PyObject *separator)
7365{
7366 return PyUnicode_Partition((PyObject *)self, separator);
7367}
7368
7369PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007370"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007371\n\
7372Searches for the separator sep in S, starting at the end of S, and returns\n\
7373the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007374separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007375
7376static PyObject*
7377unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7378{
7379 return PyUnicode_RPartition((PyObject *)self, separator);
7380}
7381
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007382PyObject *PyUnicode_RSplit(PyObject *s,
7383 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007384 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007385{
7386 PyObject *result;
7387
7388 s = PyUnicode_FromObject(s);
7389 if (s == NULL)
7390 return NULL;
7391 if (sep != NULL) {
7392 sep = PyUnicode_FromObject(sep);
7393 if (sep == NULL) {
7394 Py_DECREF(s);
7395 return NULL;
7396 }
7397 }
7398
7399 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7400
7401 Py_DECREF(s);
7402 Py_XDECREF(sep);
7403 return result;
7404}
7405
7406PyDoc_STRVAR(rsplit__doc__,
7407"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7408\n\
7409Return a list of the words in S, using sep as the\n\
7410delimiter string, starting at the end of the string and\n\
7411working to the front. If maxsplit is given, at most maxsplit\n\
7412splits are done. If sep is not specified, any whitespace string\n\
7413is a separator.");
7414
7415static PyObject*
7416unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7417{
7418 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007419 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007420
Martin v. Löwis18e16552006-02-15 17:27:45 +00007421 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007422 return NULL;
7423
7424 if (substring == Py_None)
7425 return rsplit(self, NULL, maxcount);
7426 else if (PyUnicode_Check(substring))
7427 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7428 else
7429 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7430}
7431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007432PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007433"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434\n\
7435Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007436Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007437is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438
7439static PyObject*
7440unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7441{
Guido van Rossum86662912000-04-11 15:38:46 +00007442 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443
Guido van Rossum86662912000-04-11 15:38:46 +00007444 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 return NULL;
7446
Guido van Rossum86662912000-04-11 15:38:46 +00007447 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448}
7449
7450static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007451PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452{
Walter Dörwald346737f2007-05-31 10:44:43 +00007453 if (PyUnicode_CheckExact(self)) {
7454 Py_INCREF(self);
7455 return self;
7456 } else
7457 /* Subtype -- return genuine unicode string with the same value. */
7458 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7459 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460}
7461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007462PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463"S.swapcase() -> unicode\n\
7464\n\
7465Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007466and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
7468static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007469unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 return fixup(self, fixswapcase);
7472}
7473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007474PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475"S.translate(table) -> unicode\n\
7476\n\
7477Return a copy of the string S, where all characters have been mapped\n\
7478through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007479Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7480Unmapped characters are left untouched. Characters mapped to None\n\
7481are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
7483static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007484unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485{
Tim Petersced69f82003-09-16 20:30:58 +00007486 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007488 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 "ignore");
7490}
7491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007492PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493"S.upper() -> unicode\n\
7494\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007495Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496
7497static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007498unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 return fixup(self, fixupper);
7501}
7502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504"S.zfill(width) -> unicode\n\
7505\n\
7506Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007507of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508
7509static PyObject *
7510unicode_zfill(PyUnicodeObject *self, PyObject *args)
7511{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007512 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 PyUnicodeObject *u;
7514
Martin v. Löwis18e16552006-02-15 17:27:45 +00007515 Py_ssize_t width;
7516 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517 return NULL;
7518
7519 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007520 if (PyUnicode_CheckExact(self)) {
7521 Py_INCREF(self);
7522 return (PyObject*) self;
7523 }
7524 else
7525 return PyUnicode_FromUnicode(
7526 PyUnicode_AS_UNICODE(self),
7527 PyUnicode_GET_SIZE(self)
7528 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 }
7530
7531 fill = width - self->length;
7532
7533 u = pad(self, fill, 0, '0');
7534
Walter Dörwald068325e2002-04-15 13:36:47 +00007535 if (u == NULL)
7536 return NULL;
7537
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 if (u->str[fill] == '+' || u->str[fill] == '-') {
7539 /* move sign to beginning of string */
7540 u->str[0] = u->str[fill];
7541 u->str[fill] = '0';
7542 }
7543
7544 return (PyObject*) u;
7545}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size as
   a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
7554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007555PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007556"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007558Return True if S starts with the specified prefix, False otherwise.\n\
7559With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007560With optional end, stop comparing S at that position.\n\
7561prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562
7563static PyObject *
7564unicode_startswith(PyUnicodeObject *self,
7565 PyObject *args)
7566{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007567 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007569 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007570 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007571 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007573 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007574 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007576 if (PyTuple_Check(subobj)) {
7577 Py_ssize_t i;
7578 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7579 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7580 PyTuple_GET_ITEM(subobj, i));
7581 if (substring == NULL)
7582 return NULL;
7583 result = tailmatch(self, substring, start, end, -1);
7584 Py_DECREF(substring);
7585 if (result) {
7586 Py_RETURN_TRUE;
7587 }
7588 }
7589 /* nothing matched */
7590 Py_RETURN_FALSE;
7591 }
7592 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007594 return NULL;
7595 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007597 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598}
7599
7600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007601PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007602"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007604Return True if S ends with the specified suffix, False otherwise.\n\
7605With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007606With optional end, stop comparing S at that position.\n\
7607suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608
7609static PyObject *
7610unicode_endswith(PyUnicodeObject *self,
7611 PyObject *args)
7612{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007613 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007615 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007616 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007617 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007619 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7620 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007622 if (PyTuple_Check(subobj)) {
7623 Py_ssize_t i;
7624 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7625 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7626 PyTuple_GET_ITEM(subobj, i));
7627 if (substring == NULL)
7628 return NULL;
7629 result = tailmatch(self, substring, start, end, +1);
7630 Py_DECREF(substring);
7631 if (result) {
7632 Py_RETURN_TRUE;
7633 }
7634 }
7635 Py_RETURN_FALSE;
7636 }
7637 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007641 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007643 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644}
7645
7646
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007647
7648static PyObject *
7649unicode_getnewargs(PyUnicodeObject *v)
7650{
7651 return Py_BuildValue("(u#)", v->str, v->length);
7652}
7653
7654
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655static PyMethodDef unicode_methods[] = {
7656
7657 /* Order is according to common usage: often used methods should
7658 appear first, since lookup is done sequentially. */
7659
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007660 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7661 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7662 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007663 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007664 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7665 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7666 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7667 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7668 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7669 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7670 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007671 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007672 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7673 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7674 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007675 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007676 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007677/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7678 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7679 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7680 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007681 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007682 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007683 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007684 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007685 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7686 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7687 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7688 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7689 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7690 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7691 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7692 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7693 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7694 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7695 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7696 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7697 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7698 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007699 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007700#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007701 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702#endif
7703
7704#if 0
7705 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007706 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707#endif
7708
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007709 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 {NULL, NULL}
7711};
7712
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007713static PyObject *
7714unicode_mod(PyObject *v, PyObject *w)
7715{
7716 if (!PyUnicode_Check(v)) {
7717 Py_INCREF(Py_NotImplemented);
7718 return Py_NotImplemented;
7719 }
7720 return PyUnicode_Format(v, w);
7721}
7722
7723static PyNumberMethods unicode_as_number = {
7724 0, /*nb_add*/
7725 0, /*nb_subtract*/
7726 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007727 unicode_mod, /*nb_remainder*/
7728};
7729
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007731 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007732 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007733 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7734 (ssizeargfunc) unicode_getitem, /* sq_item */
7735 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736 0, /* sq_ass_item */
7737 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007738 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739};
7740
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007741static PyObject*
7742unicode_subscript(PyUnicodeObject* self, PyObject* item)
7743{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007744 if (PyIndex_Check(item)) {
7745 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007746 if (i == -1 && PyErr_Occurred())
7747 return NULL;
7748 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007749 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007750 return unicode_getitem(self, i);
7751 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007752 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007753 Py_UNICODE* source_buf;
7754 Py_UNICODE* result_buf;
7755 PyObject* result;
7756
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007757 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007758 &start, &stop, &step, &slicelength) < 0) {
7759 return NULL;
7760 }
7761
7762 if (slicelength <= 0) {
7763 return PyUnicode_FromUnicode(NULL, 0);
7764 } else {
7765 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007766 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7767 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007768
7769 if (result_buf == NULL)
7770 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007771
7772 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7773 result_buf[i] = source_buf[cur];
7774 }
Tim Petersced69f82003-09-16 20:30:58 +00007775
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007776 result = PyUnicode_FromUnicode(result_buf, slicelength);
7777 PyMem_FREE(result_buf);
7778 return result;
7779 }
7780 } else {
7781 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7782 return NULL;
7783 }
7784}
7785
7786static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007787 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007788 (binaryfunc)unicode_subscript, /* mp_subscript */
7789 (objobjargproc)0, /* mp_ass_subscript */
7790};
7791
Martin v. Löwis18e16552006-02-15 17:27:45 +00007792static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007794 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795 const void **ptr)
7796{
7797 if (index != 0) {
7798 PyErr_SetString(PyExc_SystemError,
7799 "accessing non-existent unicode segment");
7800 return -1;
7801 }
7802 *ptr = (void *) self->str;
7803 return PyUnicode_GET_DATA_SIZE(self);
7804}
7805
Martin v. Löwis18e16552006-02-15 17:27:45 +00007806static Py_ssize_t
7807unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 const void **ptr)
7809{
7810 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007811 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 return -1;
7813}
7814
7815static int
7816unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007817 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818{
7819 if (lenp)
7820 *lenp = PyUnicode_GET_DATA_SIZE(self);
7821 return 1;
7822}
7823
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007824static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007826 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827 const void **ptr)
7828{
7829 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007830
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 if (index != 0) {
7832 PyErr_SetString(PyExc_SystemError,
7833 "accessing non-existent unicode segment");
7834 return -1;
7835 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007836 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 if (str == NULL)
7838 return -1;
7839 *ptr = (void *) PyString_AS_STRING(str);
7840 return PyString_GET_SIZE(str);
7841}
7842
7843/* Helpers for PyUnicode_Format() */
7844
7845static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007846getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007848 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 if (argidx < arglen) {
7850 (*p_argidx)++;
7851 if (arglen < 0)
7852 return args;
7853 else
7854 return PyTuple_GetItem(args, argidx);
7855 }
7856 PyErr_SetString(PyExc_TypeError,
7857 "not enough arguments for format string");
7858 return NULL;
7859}
7860
/* Conversion flag bits collected while parsing a format specifier; each
   one corresponds to a flag character. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
7866
Martin v. Löwis18e16552006-02-15 17:27:45 +00007867static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007868strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007870 register Py_ssize_t i;
7871 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 for (i = len - 1; i >= 0; i--)
7873 buffer[i] = (Py_UNICODE) charbuffer[i];
7874
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 return len;
7876}
7877
Neal Norwitzfc76d632006-01-10 06:03:13 +00007878static int
7879doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7880{
Tim Peters15231542006-02-16 01:08:01 +00007881 Py_ssize_t result;
7882
Neal Norwitzfc76d632006-01-10 06:03:13 +00007883 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007884 result = strtounicode(buffer, (char *)buffer);
7885 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007886}
7887
7888static int
7889longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7890{
Tim Peters15231542006-02-16 01:08:01 +00007891 Py_ssize_t result;
7892
Neal Norwitzfc76d632006-01-10 06:03:13 +00007893 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007894 result = strtounicode(buffer, (char *)buffer);
7895 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007896}
7897
Guido van Rossum078151d2002-08-11 04:24:12 +00007898/* XXX To save some code duplication, formatfloat/long/int could have been
7899 shared with stringobject.c, converting from 8-bit to Unicode after the
7900 formatting is done. */
7901
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902static int
7903formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007904 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 int flags,
7906 int prec,
7907 int type,
7908 PyObject *v)
7909{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007910 /* fmt = '%#.' + `prec` + `type`
7911 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 char fmt[20];
7913 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007914
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915 x = PyFloat_AsDouble(v);
7916 if (x == -1.0 && PyErr_Occurred())
7917 return -1;
7918 if (prec < 0)
7919 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7921 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007922 /* Worst case length calc to ensure no buffer overrun:
7923
7924 'g' formats:
7925 fmt = %#.<prec>g
7926 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7927 for any double rep.)
7928 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7929
7930 'f' formats:
7931 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7932 len = 1 + 50 + 1 + prec = 52 + prec
7933
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007934 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007935 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007936
7937 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00007938 if (((type == 'g' || type == 'G') &&
7939 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007940 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007941 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007942 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007943 return -1;
7944 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007945 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7946 (flags&F_ALT) ? "#" : "",
7947 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007948 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949}
7950
Tim Peters38fd5b62000-09-21 05:43:11 +00007951static PyObject*
7952formatlong(PyObject *val, int flags, int prec, int type)
7953{
7954 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007955 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00007956 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00007957 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007958
7959 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7960 if (!str)
7961 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007962 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00007963 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00007964 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007965}
7966
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967static int
7968formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007969 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 int flags,
7971 int prec,
7972 int type,
7973 PyObject *v)
7974{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007975 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007976 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7977 * + 1 + 1
7978 * = 24
7979 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007980 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007981 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 long x;
7983
7984 x = PyInt_AsLong(v);
7985 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007986 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007987 if (x < 0 && type == 'u') {
7988 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007989 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007990 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7991 sign = "-";
7992 else
7993 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007995 prec = 1;
7996
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007997 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7998 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007999 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008000 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008001 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008002 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008003 return -1;
8004 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008005
8006 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008007 (type == 'x' || type == 'X' || type == 'o')) {
8008 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008009 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008010 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008011 * - when 0 is being converted, the C standard leaves off
8012 * the '0x' or '0X', which is inconsistent with other
8013 * %#x/%#X conversions and inconsistent with Python's
8014 * hex() function
8015 * - there are platforms that violate the standard and
8016 * convert 0 with the '0x' or '0X'
8017 * (Metrowerks, Compaq Tru64)
8018 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008019 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008020 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008021 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008022 * We can achieve the desired consistency by inserting our
8023 * own '0x' or '0X' prefix, and substituting %x/%X in place
8024 * of %#x/%#X.
8025 *
8026 * Note that this is the same approach as used in
8027 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008028 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008029 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8030 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008031 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008032 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008033 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8034 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008035 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008036 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008037 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008038 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008039 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008040 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041}
8042
8043static int
8044formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008045 size_t buflen,
8046 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008048 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008049 if (PyUnicode_Check(v)) {
8050 if (PyUnicode_GET_SIZE(v) != 1)
8051 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008055 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008056 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008057 goto onError;
8058 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060
8061 else {
8062 /* Integer input truncated to a character */
8063 long x;
8064 x = PyInt_AsLong(v);
8065 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008066 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008067#ifdef Py_UNICODE_WIDE
8068 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008069 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008070 "%c arg not in range(0x110000) "
8071 "(wide Python build)");
8072 return -1;
8073 }
8074#else
8075 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008076 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008077 "%c arg not in range(0x10000) "
8078 "(narrow Python build)");
8079 return -1;
8080 }
8081#endif
8082 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 }
8084 buf[1] = '\0';
8085 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008086
8087 onError:
8088 PyErr_SetString(PyExc_TypeError,
8089 "%c requires int or char");
8090 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091}
8092
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008093/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8094
8095 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8096 chars are formatted. XXX This is a magic number. Each formatting
8097 routine does bounds checking to ensure no overflow, but a better
8098 solution may be to malloc a buffer of appropriate size for each
8099 format. For now, the current solution is sufficient.
8100*/
/* Parenthesised so the cast expression expands safely in any context
   (e.g. as the operand of sizeof or next to a postfix operator). */
#define FORMATBUFLEN ((size_t)120)
8102
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103PyObject *PyUnicode_Format(PyObject *format,
8104 PyObject *args)
8105{
8106 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008107 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108 int args_owned = 0;
8109 PyUnicodeObject *result = NULL;
8110 PyObject *dict = NULL;
8111 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008112
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 if (format == NULL || args == NULL) {
8114 PyErr_BadInternalCall();
8115 return NULL;
8116 }
8117 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008118 if (uformat == NULL)
8119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 fmt = PyUnicode_AS_UNICODE(uformat);
8121 fmtcnt = PyUnicode_GET_SIZE(uformat);
8122
8123 reslen = rescnt = fmtcnt + 100;
8124 result = _PyUnicode_New(reslen);
8125 if (result == NULL)
8126 goto onError;
8127 res = PyUnicode_AS_UNICODE(result);
8128
8129 if (PyTuple_Check(args)) {
8130 arglen = PyTuple_Size(args);
8131 argidx = 0;
8132 }
8133 else {
8134 arglen = -1;
8135 argidx = -2;
8136 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008137 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
8138 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 dict = args;
8140
8141 while (--fmtcnt >= 0) {
8142 if (*fmt != '%') {
8143 if (--rescnt < 0) {
8144 rescnt = fmtcnt + 100;
8145 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008146 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008147 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8149 --rescnt;
8150 }
8151 *res++ = *fmt++;
8152 }
8153 else {
8154 /* Got a format specifier */
8155 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 Py_UNICODE c = '\0';
8159 Py_UNICODE fill;
8160 PyObject *v = NULL;
8161 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008162 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008164 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008165 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166
8167 fmt++;
8168 if (*fmt == '(') {
8169 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008170 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171 PyObject *key;
8172 int pcount = 1;
8173
8174 if (dict == NULL) {
8175 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008176 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177 goto onError;
8178 }
8179 ++fmt;
8180 --fmtcnt;
8181 keystart = fmt;
8182 /* Skip over balanced parentheses */
8183 while (pcount > 0 && --fmtcnt >= 0) {
8184 if (*fmt == ')')
8185 --pcount;
8186 else if (*fmt == '(')
8187 ++pcount;
8188 fmt++;
8189 }
8190 keylen = fmt - keystart - 1;
8191 if (fmtcnt < 0 || pcount > 0) {
8192 PyErr_SetString(PyExc_ValueError,
8193 "incomplete format key");
8194 goto onError;
8195 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008196#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008197 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 then looked up since Python uses strings to hold
8199 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008200 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 key = PyUnicode_EncodeUTF8(keystart,
8202 keylen,
8203 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008204#else
8205 key = PyUnicode_FromUnicode(keystart, keylen);
8206#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207 if (key == NULL)
8208 goto onError;
8209 if (args_owned) {
8210 Py_DECREF(args);
8211 args_owned = 0;
8212 }
8213 args = PyObject_GetItem(dict, key);
8214 Py_DECREF(key);
8215 if (args == NULL) {
8216 goto onError;
8217 }
8218 args_owned = 1;
8219 arglen = -1;
8220 argidx = -2;
8221 }
8222 while (--fmtcnt >= 0) {
8223 switch (c = *fmt++) {
8224 case '-': flags |= F_LJUST; continue;
8225 case '+': flags |= F_SIGN; continue;
8226 case ' ': flags |= F_BLANK; continue;
8227 case '#': flags |= F_ALT; continue;
8228 case '0': flags |= F_ZERO; continue;
8229 }
8230 break;
8231 }
8232 if (c == '*') {
8233 v = getnextarg(args, arglen, &argidx);
8234 if (v == NULL)
8235 goto onError;
8236 if (!PyInt_Check(v)) {
8237 PyErr_SetString(PyExc_TypeError,
8238 "* wants int");
8239 goto onError;
8240 }
8241 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008242 if (width == -1 && PyErr_Occurred())
8243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 if (width < 0) {
8245 flags |= F_LJUST;
8246 width = -width;
8247 }
8248 if (--fmtcnt >= 0)
8249 c = *fmt++;
8250 }
8251 else if (c >= '0' && c <= '9') {
8252 width = c - '0';
8253 while (--fmtcnt >= 0) {
8254 c = *fmt++;
8255 if (c < '0' || c > '9')
8256 break;
8257 if ((width*10) / 10 != width) {
8258 PyErr_SetString(PyExc_ValueError,
8259 "width too big");
8260 goto onError;
8261 }
8262 width = width*10 + (c - '0');
8263 }
8264 }
8265 if (c == '.') {
8266 prec = 0;
8267 if (--fmtcnt >= 0)
8268 c = *fmt++;
8269 if (c == '*') {
8270 v = getnextarg(args, arglen, &argidx);
8271 if (v == NULL)
8272 goto onError;
8273 if (!PyInt_Check(v)) {
8274 PyErr_SetString(PyExc_TypeError,
8275 "* wants int");
8276 goto onError;
8277 }
8278 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008279 if (prec == -1 && PyErr_Occurred())
8280 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 if (prec < 0)
8282 prec = 0;
8283 if (--fmtcnt >= 0)
8284 c = *fmt++;
8285 }
8286 else if (c >= '0' && c <= '9') {
8287 prec = c - '0';
8288 while (--fmtcnt >= 0) {
8289 c = Py_CHARMASK(*fmt++);
8290 if (c < '0' || c > '9')
8291 break;
8292 if ((prec*10) / 10 != prec) {
8293 PyErr_SetString(PyExc_ValueError,
8294 "prec too big");
8295 goto onError;
8296 }
8297 prec = prec*10 + (c - '0');
8298 }
8299 }
8300 } /* prec */
8301 if (fmtcnt >= 0) {
8302 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 if (--fmtcnt >= 0)
8304 c = *fmt++;
8305 }
8306 }
8307 if (fmtcnt < 0) {
8308 PyErr_SetString(PyExc_ValueError,
8309 "incomplete format");
8310 goto onError;
8311 }
8312 if (c != '%') {
8313 v = getnextarg(args, arglen, &argidx);
8314 if (v == NULL)
8315 goto onError;
8316 }
8317 sign = 0;
8318 fill = ' ';
8319 switch (c) {
8320
8321 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008322 pbuf = formatbuf;
8323 /* presume that buffer length is at least 1 */
8324 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 len = 1;
8326 break;
8327
8328 case 's':
8329 case 'r':
8330 if (PyUnicode_Check(v) && c == 's') {
8331 temp = v;
8332 Py_INCREF(temp);
8333 }
8334 else {
8335 PyObject *unicode;
8336 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008337 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 else
8339 temp = PyObject_Repr(v);
8340 if (temp == NULL)
8341 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008342 if (PyUnicode_Check(temp))
8343 /* nothing to do */;
8344 else if (PyString_Check(temp)) {
8345 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008346 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008348 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008350 Py_DECREF(temp);
8351 temp = unicode;
8352 if (temp == NULL)
8353 goto onError;
8354 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008355 else {
8356 Py_DECREF(temp);
8357 PyErr_SetString(PyExc_TypeError,
8358 "%s argument has non-string str()");
8359 goto onError;
8360 }
8361 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008362 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 len = PyUnicode_GET_SIZE(temp);
8364 if (prec >= 0 && len > prec)
8365 len = prec;
8366 break;
8367
8368 case 'i':
8369 case 'd':
8370 case 'u':
8371 case 'o':
8372 case 'x':
8373 case 'X':
8374 if (c == 'i')
8375 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008376 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008377 temp = formatlong(v, flags, prec, c);
8378 if (!temp)
8379 goto onError;
8380 pbuf = PyUnicode_AS_UNICODE(temp);
8381 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008382 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008384 else {
8385 pbuf = formatbuf;
8386 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8387 flags, prec, c, v);
8388 if (len < 0)
8389 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008390 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008391 }
8392 if (flags & F_ZERO)
8393 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 break;
8395
8396 case 'e':
8397 case 'E':
8398 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008399 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 case 'g':
8401 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008402 if (c == 'F')
8403 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008404 pbuf = formatbuf;
8405 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8406 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 if (len < 0)
8408 goto onError;
8409 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008410 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411 fill = '0';
8412 break;
8413
8414 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008415 pbuf = formatbuf;
8416 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417 if (len < 0)
8418 goto onError;
8419 break;
8420
8421 default:
8422 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008423 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008424 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008425 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008426 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008427 (Py_ssize_t)(fmt - 1 -
8428 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 goto onError;
8430 }
8431 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008432 if (*pbuf == '-' || *pbuf == '+') {
8433 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 len--;
8435 }
8436 else if (flags & F_SIGN)
8437 sign = '+';
8438 else if (flags & F_BLANK)
8439 sign = ' ';
8440 else
8441 sign = 0;
8442 }
8443 if (width < len)
8444 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008445 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 reslen -= rescnt;
8447 rescnt = width + fmtcnt + 100;
8448 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008449 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008450 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008451 PyErr_NoMemory();
8452 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008453 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008454 if (_PyUnicode_Resize(&result, reslen) < 0) {
8455 Py_XDECREF(temp);
8456 goto onError;
8457 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 res = PyUnicode_AS_UNICODE(result)
8459 + reslen - rescnt;
8460 }
8461 if (sign) {
8462 if (fill != ' ')
8463 *res++ = sign;
8464 rescnt--;
8465 if (width > len)
8466 width--;
8467 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008468 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008469 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008470 assert(pbuf[1] == c);
8471 if (fill != ' ') {
8472 *res++ = *pbuf++;
8473 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008474 }
Tim Petersfff53252001-04-12 18:38:48 +00008475 rescnt -= 2;
8476 width -= 2;
8477 if (width < 0)
8478 width = 0;
8479 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481 if (width > len && !(flags & F_LJUST)) {
8482 do {
8483 --rescnt;
8484 *res++ = fill;
8485 } while (--width > len);
8486 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008487 if (fill == ' ') {
8488 if (sign)
8489 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008490 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008491 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008492 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008493 *res++ = *pbuf++;
8494 *res++ = *pbuf++;
8495 }
8496 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008497 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 res += len;
8499 rescnt -= len;
8500 while (--width >= len) {
8501 --rescnt;
8502 *res++ = ' ';
8503 }
8504 if (dict && (argidx < arglen) && c != '%') {
8505 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008506 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008507 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 goto onError;
8509 }
8510 Py_XDECREF(temp);
8511 } /* '%' */
8512 } /* until end */
8513 if (argidx < arglen && !dict) {
8514 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008515 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 goto onError;
8517 }
8518
Thomas Woutersa96affe2006-03-12 00:29:36 +00008519 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8520 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521 if (args_owned) {
8522 Py_DECREF(args);
8523 }
8524 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525 return (PyObject *)result;
8526
8527 onError:
8528 Py_XDECREF(result);
8529 Py_DECREF(uformat);
8530 if (args_owned) {
8531 Py_DECREF(args);
8532 }
8533 return NULL;
8534}
8535
8536static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008537 (readbufferproc) unicode_buffer_getreadbuf,
8538 (writebufferproc) unicode_buffer_getwritebuf,
8539 (segcountproc) unicode_buffer_getsegcount,
8540 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541};
8542
Jeremy Hylton938ace62002-07-17 16:30:39 +00008543static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008544unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8545
Tim Peters6d6c1a32001-08-02 04:15:00 +00008546static PyObject *
8547unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8548{
8549 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008550 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008551 char *encoding = NULL;
8552 char *errors = NULL;
8553
Guido van Rossume023fe02001-08-30 03:12:59 +00008554 if (type != &PyUnicode_Type)
8555 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008556 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8557 kwlist, &x, &encoding, &errors))
8558 return NULL;
8559 if (x == NULL)
8560 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008561 if (encoding == NULL && errors == NULL)
8562 return PyObject_Unicode(x);
8563 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008564 return PyUnicode_FromEncodedObject(x, encoding, errors);
8565}
8566
Guido van Rossume023fe02001-08-30 03:12:59 +00008567static PyObject *
8568unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8569{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008570 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008571 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008572
8573 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8574 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8575 if (tmp == NULL)
8576 return NULL;
8577 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008578 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008579 if (pnew == NULL) {
8580 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008581 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008582 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008583 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8584 if (pnew->str == NULL) {
8585 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008586 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008587 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008588 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008589 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008590 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8591 pnew->length = n;
8592 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008593 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008594 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008595}
8596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008597PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008598"unicode(string [, encoding[, errors]]) -> object\n\
8599\n\
8600Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008601encoding defaults to the current default string encoding.\n\
8602errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008603
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008604static PyObject *unicode_iter(PyObject *seq);
8605
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606PyTypeObject PyUnicode_Type = {
8607 PyObject_HEAD_INIT(&PyType_Type)
8608 0, /* ob_size */
Guido van Rossum84fc66d2007-05-03 17:18:26 +00008609 "str", /* tp_name */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 sizeof(PyUnicodeObject), /* tp_size */
8611 0, /* tp_itemsize */
8612 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008613 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008615 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 0, /* tp_setattr */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008617 0, /* tp_compare */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008618 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008619 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008621 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 (hashfunc) unicode_hash, /* tp_hash*/
8623 0, /* tp_call*/
8624 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008625 PyObject_GenericGetAttr, /* tp_getattro */
8626 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 &unicode_as_buffer, /* tp_as_buffer */
Thomas Wouters27d517b2007-02-25 20:39:11 +00008628 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8629 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008630 unicode_doc, /* tp_doc */
8631 0, /* tp_traverse */
8632 0, /* tp_clear */
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008633 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008634 0, /* tp_weaklistoffset */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008635 unicode_iter, /* tp_iter */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008636 0, /* tp_iternext */
8637 unicode_methods, /* tp_methods */
8638 0, /* tp_members */
8639 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008640 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008641 0, /* tp_dict */
8642 0, /* tp_descr_get */
8643 0, /* tp_descr_set */
8644 0, /* tp_dictoffset */
8645 0, /* tp_init */
8646 0, /* tp_alloc */
8647 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008648 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649};
8650
8651/* Initialize the Unicode implementation */
8652
Thomas Wouters78890102000-07-22 19:25:51 +00008653void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008655 int i;
8656
Thomas Wouters477c8d52006-05-27 19:21:47 +00008657 /* XXX - move this array to unicodectype.c ? */
8658 Py_UNICODE linebreak[] = {
8659 0x000A, /* LINE FEED */
8660 0x000D, /* CARRIAGE RETURN */
8661 0x001C, /* FILE SEPARATOR */
8662 0x001D, /* GROUP SEPARATOR */
8663 0x001E, /* RECORD SEPARATOR */
8664 0x0085, /* NEXT LINE */
8665 0x2028, /* LINE SEPARATOR */
8666 0x2029, /* PARAGRAPH SEPARATOR */
8667 };
8668
Fred Drakee4315f52000-05-09 19:53:39 +00008669 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008670 unicode_freelist = NULL;
8671 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008673 if (!unicode_empty)
8674 return;
8675
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008676 for (i = 0; i < 256; i++)
8677 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008678 if (PyType_Ready(&PyUnicode_Type) < 0)
8679 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008680
8681 /* initialize the linebreak bloom filter */
8682 bloom_linebreak = make_bloom_mask(
8683 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8684 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008685
8686 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687}
8688
8689/* Finalize the Unicode implementation */
8690
8691void
Thomas Wouters78890102000-07-22 19:25:51 +00008692_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008694 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008695 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008697 Py_XDECREF(unicode_empty);
8698 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008699
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008700 for (i = 0; i < 256; i++) {
8701 if (unicode_latin1[i]) {
8702 Py_DECREF(unicode_latin1[i]);
8703 unicode_latin1[i] = NULL;
8704 }
8705 }
8706
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008707 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 PyUnicodeObject *v = u;
8709 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008710 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008711 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008712 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008713 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008715 unicode_freelist = NULL;
8716 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008718
Walter Dörwald16807132007-05-25 13:52:07 +00008719void
8720PyUnicode_InternInPlace(PyObject **p)
8721{
8722 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8723 PyObject *t;
8724 if (s == NULL || !PyUnicode_Check(s))
8725 Py_FatalError(
8726 "PyUnicode_InternInPlace: unicode strings only please!");
8727 /* If it's a subclass, we don't really know what putting
8728 it in the interned dict might do. */
8729 if (!PyUnicode_CheckExact(s))
8730 return;
8731 if (PyUnicode_CHECK_INTERNED(s))
8732 return;
8733 if (interned == NULL) {
8734 interned = PyDict_New();
8735 if (interned == NULL) {
8736 PyErr_Clear(); /* Don't leave an exception */
8737 return;
8738 }
8739 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008740 /* It might be that the GetItem call fails even
8741 though the key is present in the dictionary,
8742 namely when this happens during a stack overflow. */
8743 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00008744 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00008745 Py_END_ALLOW_RECURSION
8746
Walter Dörwald16807132007-05-25 13:52:07 +00008747 if (t) {
8748 Py_INCREF(t);
8749 Py_DECREF(*p);
8750 *p = t;
8751 return;
8752 }
8753
Martin v. Löwis5b222132007-06-10 09:51:05 +00008754 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008755 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8756 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00008757 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008758 return;
8759 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00008760 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00008761 /* The two references in interned are not counted by refcnt.
8762 The deallocator will take care of this */
8763 s->ob_refcnt -= 2;
8764 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8765}
8766
8767void
8768PyUnicode_InternImmortal(PyObject **p)
8769{
8770 PyUnicode_InternInPlace(p);
8771 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8772 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8773 Py_INCREF(*p);
8774 }
8775}
8776
8777PyObject *
8778PyUnicode_InternFromString(const char *cp)
8779{
8780 PyObject *s = PyUnicode_FromString(cp);
8781 if (s == NULL)
8782 return NULL;
8783 PyUnicode_InternInPlace(&s);
8784 return s;
8785}
8786
8787void _Py_ReleaseInternedUnicodeStrings(void)
8788{
8789 PyObject *keys;
8790 PyUnicodeObject *s;
8791 Py_ssize_t i, n;
8792 Py_ssize_t immortal_size = 0, mortal_size = 0;
8793
8794 if (interned == NULL || !PyDict_Check(interned))
8795 return;
8796 keys = PyDict_Keys(interned);
8797 if (keys == NULL || !PyList_Check(keys)) {
8798 PyErr_Clear();
8799 return;
8800 }
8801
8802 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8803 detector, interned unicode strings are not forcibly deallocated;
8804 rather, we give them their stolen references back, and then clear
8805 and DECREF the interned dict. */
8806
8807 n = PyList_GET_SIZE(keys);
8808 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8809 n);
8810 for (i = 0; i < n; i++) {
8811 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8812 switch (s->state) {
8813 case SSTATE_NOT_INTERNED:
8814 /* XXX Shouldn't happen */
8815 break;
8816 case SSTATE_INTERNED_IMMORTAL:
8817 s->ob_refcnt += 1;
8818 immortal_size += s->length;
8819 break;
8820 case SSTATE_INTERNED_MORTAL:
8821 s->ob_refcnt += 2;
8822 mortal_size += s->length;
8823 break;
8824 default:
8825 Py_FatalError("Inconsistent interned string state.");
8826 }
8827 s->state = SSTATE_NOT_INTERNED;
8828 }
8829 fprintf(stderr, "total size of all interned strings: "
8830 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8831 "mortal/immortal\n", mortal_size, immortal_size);
8832 Py_DECREF(keys);
8833 PyDict_Clear(interned);
8834 Py_DECREF(interned);
8835 interned = NULL;
8836}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008837
8838
8839/********************* Unicode Iterator **************************/
8840
8841typedef struct {
8842 PyObject_HEAD
Guido van Rossum49d6b072006-08-17 21:11:47 +00008843 Py_ssize_t it_index;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008844 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8845} unicodeiterobject;
8846
8847static void
8848unicodeiter_dealloc(unicodeiterobject *it)
8849{
8850 _PyObject_GC_UNTRACK(it);
8851 Py_XDECREF(it->it_seq);
8852 PyObject_GC_Del(it);
8853}
8854
8855static int
8856unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8857{
8858 Py_VISIT(it->it_seq);
8859 return 0;
8860}
8861
8862static PyObject *
8863unicodeiter_next(unicodeiterobject *it)
8864{
8865 PyUnicodeObject *seq;
8866 PyObject *item;
8867
8868 assert(it != NULL);
8869 seq = it->it_seq;
8870 if (seq == NULL)
8871 return NULL;
8872 assert(PyUnicode_Check(seq));
8873
8874 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008875 item = PyUnicode_FromUnicode(
8876 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008877 if (item != NULL)
8878 ++it->it_index;
8879 return item;
8880 }
8881
8882 Py_DECREF(seq);
8883 it->it_seq = NULL;
8884 return NULL;
8885}
8886
8887static PyObject *
8888unicodeiter_len(unicodeiterobject *it)
8889{
8890 Py_ssize_t len = 0;
8891 if (it->it_seq)
8892 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8893 return PyInt_FromSsize_t(len);
8894}
8895
8896PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8897
8898static PyMethodDef unicodeiter_methods[] = {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008899 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8900 length_hint_doc},
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008901 {NULL, NULL} /* sentinel */
8902};
8903
8904PyTypeObject PyUnicodeIter_Type = {
8905 PyObject_HEAD_INIT(&PyType_Type)
8906 0, /* ob_size */
8907 "unicodeiterator", /* tp_name */
8908 sizeof(unicodeiterobject), /* tp_basicsize */
8909 0, /* tp_itemsize */
8910 /* methods */
Guido van Rossum49d6b072006-08-17 21:11:47 +00008911 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008912 0, /* tp_print */
8913 0, /* tp_getattr */
8914 0, /* tp_setattr */
8915 0, /* tp_compare */
8916 0, /* tp_repr */
8917 0, /* tp_as_number */
8918 0, /* tp_as_sequence */
8919 0, /* tp_as_mapping */
8920 0, /* tp_hash */
8921 0, /* tp_call */
8922 0, /* tp_str */
8923 PyObject_GenericGetAttr, /* tp_getattro */
8924 0, /* tp_setattro */
8925 0, /* tp_as_buffer */
8926 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8927 0, /* tp_doc */
8928 (traverseproc)unicodeiter_traverse, /* tp_traverse */
8929 0, /* tp_clear */
8930 0, /* tp_richcompare */
8931 0, /* tp_weaklistoffset */
8932 PyObject_SelfIter, /* tp_iter */
8933 (iternextfunc)unicodeiter_next, /* tp_iternext */
8934 unicodeiter_methods, /* tp_methods */
8935 0,
8936};
8937
8938static PyObject *
8939unicode_iter(PyObject *seq)
8940{
8941 unicodeiterobject *it;
8942
8943 if (!PyUnicode_Check(seq)) {
8944 PyErr_BadInternalCall();
8945 return NULL;
8946 }
8947 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8948 if (it == NULL)
8949 return NULL;
8950 it->it_index = 0;
8951 Py_INCREF(seq);
8952 it->it_seq = (PyUnicodeObject *)seq;
8953 _PyObject_GC_TRACK(it);
8954 return (PyObject *)it;
8955}
8956
Martin v. Löwis5b222132007-06-10 09:51:05 +00008957size_t
8958Py_UNICODE_strlen(const Py_UNICODE *u)
8959{
8960 int res = 0;
8961 while(*u++)
8962 res++;
8963 return res;
8964}
8965
8966Py_UNICODE*
8967Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8968{
8969 Py_UNICODE *u = s1;
8970 while ((*u++ = *s2++));
8971 return s1;
8972}
8973
8974Py_UNICODE*
8975Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8976{
8977 Py_UNICODE *u = s1;
8978 while ((*u++ = *s2++))
8979 if (n-- == 0)
8980 break;
8981 return s1;
8982}
8983
8984int
8985Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8986{
8987 while (*s1 && *s2 && *s1 == *s2)
8988 s1++, s2++;
8989 if (*s1 && *s2)
8990 return (*s1 < *s2) ? -1 : +1;
8991 if (*s1)
8992 return 1;
8993 if (*s2)
8994 return -1;
8995 return 0;
8996}
8997
8998Py_UNICODE*
8999Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9000{
9001 const Py_UNICODE *p;
9002 for (p = s; *p; p++)
9003 if (*p == c)
9004 return (Py_UNICODE*)p;
9005 return NULL;
9006}
9007
9008
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009009#ifdef __cplusplus
9010}
9011#endif
9012
9013
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009014/*
9015Local variables:
9016c-basic-offset: 4
9017indent-tabs-mode: nil
9018End:
9019*/