blob: 8288f1ebeada0fbb6ae4972c3a8704fe9c14e044 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
Fred Drakee4315f52000-05-09 19:53:39 +0000116/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000117 parameter; it is fixed to "utf-8". Always use the
118 PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000120
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000121Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000122PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000123{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000124#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000125 return 0x10FFFF;
126#else
127 /* This is actually an illegal character, so it should
128 not be passed to unichr. */
129 return 0xFFFF;
130#endif
131}
132
Thomas Wouters477c8d52006-05-27 19:21:47 +0000133/* --- Bloom Filters ----------------------------------------------------- */
134
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */
138
139/* the linebreak mask is set up by Unicode_Init below */
140
/* Integer type holding the 32-bit bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   The shifted constant is 1UL rather than 1: shifting a signed int 1 by 31
   is undefined behaviour and, where it "works", sign-extends into the upper
   bits of an unsigned long.  `mask` is parenthesized so that expressions
   such as `a | b` can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check followed by the exact linebreak test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152 /* calculate simple bloom-style bitmask for a given unicode string */
153
154 long mask;
155 Py_ssize_t i;
156
157 mask = 0;
158 for (i = 0; i < len; i++)
159 mask |= (1 << (ptr[i] & 0x1F));
160
161 return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166 Py_ssize_t i;
167
168 for (i = 0; i < setlen; i++)
169 if (set[i] == chr)
170 return 1;
171
172 return 0;
173}
174
/* Bloom pre-check followed by the exact membership test.  The expansion is
   fully parenthesized so it can be negated or combined with other
   operators without precedence surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
Guido van Rossumd57fd912000-03-10 22:53:23 +0000178/* --- Unicode Object ----------------------------------------------------- */
179
180static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000182 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183{
184 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000185
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000186 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000188 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190 /* Resizing shared object (unicode_empty or single character
191 objects) in-place is not allowed. Use PyUnicode_Resize()
192 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000193
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000194 if (unicode == unicode_empty ||
195 (unicode->length == 1 &&
196 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000197 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 return -1;
201 }
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203 /* We allocate one more byte to make sure the string is Ux0000 terminated.
204 The overallocation is also used by fastsearch, which assumes that it's
205 safe to look at str[length] (without making any assumptions about what
206 it contains). */
207
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 oldstr = unicode->str;
209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 PyErr_NoMemory();
213 return -1;
214 }
215 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000216 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000220 if (unicode->defenc) {
221 Py_DECREF(unicode->defenc);
222 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 }
224 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000225
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 return 0;
227}
228
229/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000230 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231
232 XXX This allocator could further be enhanced by assuring that the
233 free list never reduces its size below 1.
234
235*/
236
237static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000239{
240 register PyUnicodeObject *unicode;
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (length == 0 && unicode_empty != NULL) {
244 Py_INCREF(unicode_empty);
245 return unicode_empty;
246 }
247
248 /* Unicode freelist & memory allocation */
249 if (unicode_freelist) {
250 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000251 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000254 /* Keep-Alive optimization: we only upsize the buffer,
255 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000256 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000257 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000258 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000259 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000262 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000264 }
265 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 }
267 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode == NULL)
270 return NULL;
271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272 }
273
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000274 if (!unicode->str) {
275 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000276 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000277 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000279 * the caller fails before initializing str -- unicode_resize()
280 * reads str[0], and the Keep-Alive optimization can keep memory
281 * allocated for str alive across a call to unicode_dealloc(unicode).
282 * We don't want unicode_resize to read uninitialized memory in
283 * that case.
284 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000285 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000289 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000290 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000292
293 onError:
294 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000295 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297}
298
299static
Guido van Rossum9475a232001-10-05 20:51:39 +0000300void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
Walter Dörwald16807132007-05-25 13:52:07 +0000302 switch (PyUnicode_CHECK_INTERNED(unicode)) {
303 case SSTATE_NOT_INTERNED:
304 break;
305
306 case SSTATE_INTERNED_MORTAL:
307 /* revive dead object temporarily for DelItem */
Martin v. Löwis5d7428b2007-07-21 18:47:48 +0000308 Py_Refcnt(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310 Py_FatalError(
311 "deletion of interned unicode string failed");
312 break;
313
314 case SSTATE_INTERNED_IMMORTAL:
315 Py_FatalError("Immortal interned unicode string died.");
316
317 default:
318 Py_FatalError("Inconsistent interned unicode string state.");
319 }
320
Guido van Rossum604ddf82001-12-06 20:03:56 +0000321 if (PyUnicode_CheckExact(unicode) &&
322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000323 /* Keep-Alive optimization */
324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000325 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode->str = NULL;
327 unicode->length = 0;
328 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000329 if (unicode->defenc) {
330 Py_DECREF(unicode->defenc);
331 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000332 }
333 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 *(PyUnicodeObject **)unicode = unicode_freelist;
335 unicode_freelist = unicode;
336 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000339 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000340 Py_XDECREF(unicode->defenc);
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000341 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 }
343}
344
Martin v. Löwis18e16552006-02-15 17:27:45 +0000345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346{
347 register PyUnicodeObject *v;
348
349 /* Argument checks */
350 if (unicode == NULL) {
351 PyErr_BadInternalCall();
352 return -1;
353 }
354 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +0000355 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000356 PyErr_BadInternalCall();
357 return -1;
358 }
359
360 /* Resizing unicode_empty and single character objects is not
361 possible since these are being shared. We simply return a fresh
362 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000363 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000364 (v == unicode_empty || v->length == 1)) {
365 PyUnicodeObject *w = _PyUnicode_New(length);
366 if (w == NULL)
367 return -1;
368 Py_UNICODE_COPY(w->str, v->str,
369 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000370 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000371 *unicode = (PyObject *)w;
372 return 0;
373 }
374
375 /* Note that we don't have to modify *unicode for unshared Unicode
376 objects, since we can modify them in-place. */
377 return unicode_resize(v, length);
378}
379
380/* Internal API for use in unicodeobject.c only ! */
381#define _PyUnicode_Resize(unicodevar, length) \
382 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
383
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000385 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 PyUnicodeObject *unicode;
388
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000389 /* If the Unicode data is known at construction time, we can apply
390 some optimizations which share commonly used objects. */
391 if (u != NULL) {
392
393 /* Optimization for empty strings */
394 if (size == 0 && unicode_empty != NULL) {
395 Py_INCREF(unicode_empty);
396 return (PyObject *)unicode_empty;
397 }
398
399 /* Single character Unicode objects in the Latin-1 range are
400 shared when using this constructor */
401 if (size == 1 && *u < 256) {
402 unicode = unicode_latin1[*u];
403 if (!unicode) {
404 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000405 if (!unicode)
406 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000407 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000408 unicode_latin1[*u] = unicode;
409 }
410 Py_INCREF(unicode);
411 return (PyObject *)unicode;
412 }
413 }
Tim Petersced69f82003-09-16 20:30:58 +0000414
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode = _PyUnicode_New(size);
416 if (!unicode)
417 return NULL;
418
419 /* Copy the Unicode data into the new object */
420 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000421 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422
423 return (PyObject *)unicode;
424}
425
Walter Dörwaldd2034312007-05-18 16:29:38 +0000426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000427{
428 PyUnicodeObject *unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000429 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000430 some optimizations which share commonly used objects.
431 Also, this means the input must be UTF-8, so fall back to the
432 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000433 if (u != NULL) {
434
435 /* Optimization for empty strings */
436 if (size == 0 && unicode_empty != NULL) {
437 Py_INCREF(unicode_empty);
438 return (PyObject *)unicode_empty;
439 }
440
Martin v. Löwis9c121062007-08-05 20:26:11 +0000441 /* Single characters are shared when using this constructor.
442 Restrict to ASCII, since the input must be UTF-8. */
443 if (size == 1 && Py_CHARMASK(*u) < 128) {
Guido van Rossum00058aa2007-07-19 18:21:28 +0000444 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000445 if (!unicode) {
446 unicode = _PyUnicode_New(1);
447 if (!unicode)
448 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000449 unicode->str[0] = Py_CHARMASK(*u);
450 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000451 }
452 Py_INCREF(unicode);
453 return (PyObject *)unicode;
454 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000455
456 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000457 }
458
Walter Dörwald55507312007-05-18 13:12:10 +0000459 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000460 if (!unicode)
461 return NULL;
462
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000463 return (PyObject *)unicode;
464}
465
Walter Dörwaldd2034312007-05-18 16:29:38 +0000466PyObject *PyUnicode_FromString(const char *u)
467{
468 size_t size = strlen(u);
469 if (size > PY_SSIZE_T_MAX) {
470 PyErr_SetString(PyExc_OverflowError, "input too long");
471 return NULL;
472 }
473
474 return PyUnicode_FromStringAndSize(u, size);
475}
476
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477#ifdef HAVE_WCHAR_H
478
479PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000480 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000481{
482 PyUnicodeObject *unicode;
483
484 if (w == NULL) {
485 PyErr_BadInternalCall();
486 return NULL;
487 }
488
489 unicode = _PyUnicode_New(size);
490 if (!unicode)
491 return NULL;
492
493 /* Copy the wchar_t data into the new object */
494#ifdef HAVE_USABLE_WCHAR_T
495 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000496#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497 {
498 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000499 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000501 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000502 *u++ = *w++;
503 }
504#endif
505
506 return (PyObject *)unicode;
507}
508
Walter Dörwald346737f2007-05-31 10:44:43 +0000509static void
510makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
511{
512 *fmt++ = '%';
513 if (width) {
514 if (zeropad)
515 *fmt++ = '0';
516 fmt += sprintf(fmt, "%d", width);
517 }
518 if (precision)
519 fmt += sprintf(fmt, ".%d", precision);
520 if (longflag)
521 *fmt++ = 'l';
522 else if (size_tflag) {
523 char *f = PY_FORMAT_SIZE_T;
524 while (*f)
525 *fmt++ = *f++;
526 }
527 *fmt++ = c;
528 *fmt = '\0';
529}
530
Walter Dörwaldd2034312007-05-18 16:29:38 +0000531#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
532
533PyObject *
534PyUnicode_FromFormatV(const char *format, va_list vargs)
535{
536 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000537 Py_ssize_t callcount = 0;
538 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000539 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000540 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000541 int width = 0;
542 int precision = 0;
543 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000544 const char* f;
545 Py_UNICODE *s;
546 PyObject *string;
547 /* used by sprintf */
548 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000549 /* use abuffer instead of buffer, if we need more space
550 * (which can happen if there's a format specifier with width). */
551 char *abuffer = NULL;
552 char *realbuffer;
553 Py_ssize_t abuffersize = 0;
554 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000555 const char *copy;
556
557#ifdef VA_LIST_IS_ARRAY
558 Py_MEMCPY(count, vargs, sizeof(va_list));
559#else
560#ifdef __va_copy
561 __va_copy(count, vargs);
562#else
563 count = vargs;
564#endif
565#endif
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000566 /* step 1: count the number of %S/%R format specifications
567 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
568 * once during step 3 and put the result in an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000569 for (f = format; *f; f++) {
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000570 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000571 ++callcount;
572 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000573 /* step 2: allocate memory for the results of
574 * PyObject_Unicode()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000575 if (callcount) {
576 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
577 if (!callresults) {
578 PyErr_NoMemory();
579 return NULL;
580 }
581 callresult = callresults;
582 }
583 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000584 for (f = format; *f; f++) {
585 if (*f == '%') {
586 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000587 width = 0;
588 while (isdigit(Py_CHARMASK(*f)))
589 width = (width*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000590 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
591 ;
592
593 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
594 * they don't affect the amount of space we reserve.
595 */
596 if ((*f == 'l' || *f == 'z') &&
597 (f[1] == 'd' || f[1] == 'u'))
598 ++f;
599
600 switch (*f) {
601 case 'c':
602 (void)va_arg(count, int);
603 /* fall through... */
604 case '%':
605 n++;
606 break;
607 case 'd': case 'u': case 'i': case 'x':
608 (void) va_arg(count, int);
609 /* 20 bytes is enough to hold a 64-bit
610 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000611 This isn't enough for octal.
612 If a width is specified we need more
613 (which we allocate later). */
614 if (width < 20)
615 width = 20;
616 n += width;
617 if (abuffersize < width)
618 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000619 break;
620 case 's':
621 n += strlen(va_arg(count, char*));
622 break;
623 case 'U':
624 {
625 PyObject *obj = va_arg(count, PyObject *);
626 assert(obj && PyUnicode_Check(obj));
627 n += PyUnicode_GET_SIZE(obj);
628 break;
629 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000630 case 'V':
631 {
632 PyObject *obj = va_arg(count, PyObject *);
633 const char *str = va_arg(count, const char *);
634 assert(obj || str);
635 assert(!obj || PyUnicode_Check(obj));
636 if (obj)
637 n += PyUnicode_GET_SIZE(obj);
638 else
639 n += strlen(str);
640 break;
641 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000642 case 'S':
643 {
644 PyObject *obj = va_arg(count, PyObject *);
645 PyObject *str;
646 assert(obj);
647 str = PyObject_Unicode(obj);
648 if (!str)
649 goto fail;
650 n += PyUnicode_GET_SIZE(str);
651 /* Remember the str and switch to the next slot */
652 *callresult++ = str;
653 break;
654 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000655 case 'R':
656 {
657 PyObject *obj = va_arg(count, PyObject *);
658 PyObject *repr;
659 assert(obj);
660 repr = PyObject_Repr(obj);
661 if (!repr)
662 goto fail;
663 n += PyUnicode_GET_SIZE(repr);
664 /* Remember the repr and switch to the next slot */
665 *callresult++ = repr;
666 break;
667 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000668 case 'p':
669 (void) va_arg(count, int);
670 /* maximum 64-bit pointer representation:
671 * 0xffffffffffffffff
672 * so 19 characters is enough.
673 * XXX I count 18 -- what's the extra for?
674 */
675 n += 19;
676 break;
677 default:
678 /* if we stumble upon an unknown
679 formatting code, copy the rest of
680 the format string to the output
681 string. (we cannot just skip the
682 code, since there's no way to know
683 what's in the argument list) */
684 n += strlen(p);
685 goto expand;
686 }
687 } else
688 n++;
689 }
690 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000691 if (abuffersize > 20) {
692 abuffer = PyMem_Malloc(abuffersize);
693 if (!abuffer) {
694 PyErr_NoMemory();
695 goto fail;
696 }
697 realbuffer = abuffer;
698 }
699 else
700 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000701 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000702 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000703 we don't have to resize the string.
704 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000705 string = PyUnicode_FromUnicode(NULL, n);
706 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000707 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000708
709 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000710 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711
712 for (f = format; *f; f++) {
713 if (*f == '%') {
714 const char* p = f++;
715 int longflag = 0;
716 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000717 zeropad = (*f == '0');
718 /* parse the width.precision part */
719 width = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000720 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000721 width = (width*10) + *f++ - '0';
722 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723 if (*f == '.') {
724 f++;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725 while (isdigit(Py_CHARMASK(*f)))
Walter Dörwald346737f2007-05-31 10:44:43 +0000726 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000727 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728 /* handle the long flag, but only for %ld and %lu.
729 others can be added when necessary. */
730 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
731 longflag = 1;
732 ++f;
733 }
734 /* handle the size_t flag. */
735 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
736 size_tflag = 1;
737 ++f;
738 }
739
740 switch (*f) {
741 case 'c':
742 *s++ = va_arg(vargs, int);
743 break;
744 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000745 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000747 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000749 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000750 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000751 sprintf(realbuffer, fmt, va_arg(vargs, int));
752 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753 break;
754 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000755 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000757 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000758 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000759 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000760 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
762 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000763 break;
764 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000765 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
766 sprintf(realbuffer, fmt, va_arg(vargs, int));
767 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000768 break;
769 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000770 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
771 sprintf(realbuffer, fmt, va_arg(vargs, int));
772 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000773 break;
774 case 's':
775 p = va_arg(vargs, char*);
776 appendstring(p);
777 break;
778 case 'U':
779 {
780 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000781 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
782 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
783 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000784 break;
785 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000786 case 'V':
787 {
788 PyObject *obj = va_arg(vargs, PyObject *);
789 const char *str = va_arg(vargs, const char *);
790 if (obj) {
791 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
792 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
793 s += size;
794 } else {
795 appendstring(str);
796 }
797 break;
798 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000799 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000800 case 'R':
801 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000802 Py_UNICODE *ucopy;
803 Py_ssize_t usize;
804 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000805 /* unused, since we already have the result */
806 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000807 ucopy = PyUnicode_AS_UNICODE(*callresult);
808 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000809 for (upos = 0; upos<usize;)
810 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000811 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000812 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000813 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000814 ++callresult;
815 break;
816 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817 case 'p':
818 sprintf(buffer, "%p", va_arg(vargs, void*));
819 /* %p is ill-defined: ensure leading 0x. */
820 if (buffer[1] == 'X')
821 buffer[1] = 'x';
822 else if (buffer[1] != 'x') {
823 memmove(buffer+2, buffer, strlen(buffer)+1);
824 buffer[0] = '0';
825 buffer[1] = 'x';
826 }
827 appendstring(buffer);
828 break;
829 case '%':
830 *s++ = '%';
831 break;
832 default:
833 appendstring(p);
834 goto end;
835 }
836 } else
837 *s++ = *f;
838 }
839
840 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000841 if (callresults)
842 PyMem_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000843 if (abuffer)
844 PyMem_Free(abuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000845 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
846 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000847 fail:
848 if (callresults) {
849 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000850 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000851 Py_DECREF(*callresult2);
852 ++callresult2;
853 }
854 PyMem_Free(callresults);
855 }
Walter Dörwald346737f2007-05-31 10:44:43 +0000856 if (abuffer)
857 PyMem_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000858 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859}
860
861#undef appendstring
862
863PyObject *
864PyUnicode_FromFormat(const char *format, ...)
865{
866 PyObject* ret;
867 va_list vargs;
868
869#ifdef HAVE_STDARG_PROTOTYPES
870 va_start(vargs, format);
871#else
872 va_start(vargs);
873#endif
874 ret = PyUnicode_FromFormatV(format, vargs);
875 va_end(vargs);
876 return ret;
877}
878
Martin v. Löwis18e16552006-02-15 17:27:45 +0000879Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
880 wchar_t *w,
881 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000882{
883 if (unicode == NULL) {
884 PyErr_BadInternalCall();
885 return -1;
886 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000887
888 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000889 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000890 size = PyUnicode_GET_SIZE(unicode) + 1;
891
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892#ifdef HAVE_USABLE_WCHAR_T
893 memcpy(w, unicode->str, size * sizeof(wchar_t));
894#else
895 {
896 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000897 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000899 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900 *w++ = *u++;
901 }
902#endif
903
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000904 if (size > PyUnicode_GET_SIZE(unicode))
905 return PyUnicode_GET_SIZE(unicode);
906 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000907 return size;
908}
909
910#endif
911
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000912PyObject *PyUnicode_FromOrdinal(int ordinal)
913{
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000914 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000915
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000916 if (ordinal < 0 || ordinal > 0x10ffff) {
917 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000918 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000919 return NULL;
920 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +0000921
922#ifndef Py_UNICODE_WIDE
923 if (ordinal > 0xffff) {
924 ordinal -= 0x10000;
925 s[0] = 0xD800 | (ordinal >> 10);
926 s[1] = 0xDC00 | (ordinal & 0x3FF);
927 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000928 }
929#endif
930
Hye-Shik Chang40574832004-04-06 07:24:51 +0000931 s[0] = (Py_UNICODE)ordinal;
932 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000933}
934
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935PyObject *PyUnicode_FromObject(register PyObject *obj)
936{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000937 /* XXX Perhaps we should make this API an alias of
938 PyObject_Unicode() instead ?! */
939 if (PyUnicode_CheckExact(obj)) {
940 Py_INCREF(obj);
941 return obj;
942 }
943 if (PyUnicode_Check(obj)) {
944 /* For a Unicode subtype that's not a Unicode object,
945 return a true Unicode object with the same data. */
946 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
947 PyUnicode_GET_SIZE(obj));
948 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000949 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
950}
951
952PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
953 const char *encoding,
954 const char *errors)
955{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000956 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000957 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000958 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000959
Guido van Rossumd57fd912000-03-10 22:53:23 +0000960 if (obj == NULL) {
961 PyErr_BadInternalCall();
962 return NULL;
963 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000964
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000965#if 0
966 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000967 that no encodings is given and then redirect to
968 PyObject_Unicode() which then applies the additional logic for
969 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000970
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000971 NOTE: This API should really only be used for object which
972 represent *encoded* Unicode !
973
974 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000975 if (PyUnicode_Check(obj)) {
976 if (encoding) {
977 PyErr_SetString(PyExc_TypeError,
978 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000979 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000980 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000981 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000982 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000983#else
984 if (PyUnicode_Check(obj)) {
985 PyErr_SetString(PyExc_TypeError,
986 "decoding Unicode is not supported");
987 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000988 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000989#endif
990
991 /* Coerce object */
992 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000993 s = PyString_AS_STRING(obj);
994 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000995 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000996 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
997 /* Overwrite the error message with something more useful in
998 case of a TypeError. */
999 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001000 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001001 "coercing to Unicode: need string or buffer, "
1002 "%.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001003 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001004 goto onError;
1005 }
Tim Petersced69f82003-09-16 20:30:58 +00001006
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001007 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 if (len == 0) {
1009 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001010 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011 }
Tim Petersced69f82003-09-16 20:30:58 +00001012 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001013 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001014
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001015 return v;
1016
1017 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019}
1020
1021PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001022 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 const char *encoding,
1024 const char *errors)
1025{
1026 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001027
1028 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001029 encoding = PyUnicode_GetDefaultEncoding();
1030
1031 /* Shortcuts for common default encodings */
1032 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001034 else if (strcmp(encoding, "latin-1") == 0)
1035 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001036#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1037 else if (strcmp(encoding, "mbcs") == 0)
1038 return PyUnicode_DecodeMBCS(s, size, errors);
1039#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001040 else if (strcmp(encoding, "ascii") == 0)
1041 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 /* Decode via the codec registry */
1044 buffer = PyBuffer_FromMemory((void *)s, size);
1045 if (buffer == NULL)
1046 goto onError;
1047 unicode = PyCodec_Decode(buffer, encoding, errors);
1048 if (unicode == NULL)
1049 goto onError;
1050 if (!PyUnicode_Check(unicode)) {
1051 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001052 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00001053 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 Py_DECREF(unicode);
1055 goto onError;
1056 }
1057 Py_DECREF(buffer);
1058 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001059
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 onError:
1061 Py_XDECREF(buffer);
1062 return NULL;
1063}
1064
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001065PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1066 const char *encoding,
1067 const char *errors)
1068{
1069 PyObject *v;
1070
1071 if (!PyUnicode_Check(unicode)) {
1072 PyErr_BadArgument();
1073 goto onError;
1074 }
1075
1076 if (encoding == NULL)
1077 encoding = PyUnicode_GetDefaultEncoding();
1078
1079 /* Decode via the codec registry */
1080 v = PyCodec_Decode(unicode, encoding, errors);
1081 if (v == NULL)
1082 goto onError;
1083 return v;
1084
1085 onError:
1086 return NULL;
1087}
1088
Guido van Rossumd57fd912000-03-10 22:53:23 +00001089PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 const char *encoding,
1092 const char *errors)
1093{
1094 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001095
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 unicode = PyUnicode_FromUnicode(s, size);
1097 if (unicode == NULL)
1098 return NULL;
1099 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1100 Py_DECREF(unicode);
1101 return v;
1102}
1103
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001104PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1105 const char *encoding,
1106 const char *errors)
1107{
1108 PyObject *v;
1109
1110 if (!PyUnicode_Check(unicode)) {
1111 PyErr_BadArgument();
1112 goto onError;
1113 }
1114
1115 if (encoding == NULL)
1116 encoding = PyUnicode_GetDefaultEncoding();
1117
1118 /* Encode via the codec registry */
1119 v = PyCodec_Encode(unicode, encoding, errors);
1120 if (v == NULL)
1121 goto onError;
1122 return v;
1123
1124 onError:
1125 return NULL;
1126}
1127
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1129 const char *encoding,
1130 const char *errors)
1131{
1132 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001133
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 if (!PyUnicode_Check(unicode)) {
1135 PyErr_BadArgument();
1136 goto onError;
1137 }
Fred Drakee4315f52000-05-09 19:53:39 +00001138
Tim Petersced69f82003-09-16 20:30:58 +00001139 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001140 encoding = PyUnicode_GetDefaultEncoding();
1141
1142 /* Shortcuts for common default encodings */
1143 if (errors == NULL) {
1144 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001145 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001146 else if (strcmp(encoding, "latin-1") == 0)
1147 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001148#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1149 else if (strcmp(encoding, "mbcs") == 0)
1150 return PyUnicode_AsMBCSString(unicode);
1151#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001152 else if (strcmp(encoding, "ascii") == 0)
1153 return PyUnicode_AsASCIIString(unicode);
1154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155
1156 /* Encode via the codec registry */
1157 v = PyCodec_Encode(unicode, encoding, errors);
1158 if (v == NULL)
1159 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001160 if (!PyBytes_Check(v)) {
1161 if (PyString_Check(v)) {
1162 /* Old codec, turn it into bytes */
1163 PyObject *b = PyBytes_FromObject(v);
1164 Py_DECREF(v);
1165 return b;
1166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001168 "encoder did not return a bytes object "
1169 "(type=%.400s, encoding=%.20s, errors=%.20s)",
1170 v->ob_type->tp_name,
1171 encoding ? encoding : "NULL",
1172 errors ? errors : "NULL");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 Py_DECREF(v);
1174 goto onError;
1175 }
1176 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 onError:
1179 return NULL;
1180}
1181
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001182PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1183 const char *errors)
1184{
1185 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001186 PyObject *b;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001187 if (v)
1188 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001189 if (errors != NULL)
1190 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1191 if (errors == NULL) {
1192 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1193 PyUnicode_GET_SIZE(unicode),
1194 NULL);
1195 }
1196 else {
1197 b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1198 }
1199 if (!b)
1200 return NULL;
1201 v = PyString_FromStringAndSize(PyBytes_AsString(b),
1202 PyBytes_Size(b));
1203 Py_DECREF(b);
Guido van Rossume7a0d392007-07-12 07:53:00 +00001204 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001205 return v;
1206}
1207
Martin v. Löwis5b222132007-06-10 09:51:05 +00001208char*
1209PyUnicode_AsString(PyObject *unicode)
1210{
1211 assert(PyUnicode_Check(unicode));
1212 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1213 if (!unicode)
1214 return NULL;
1215 return PyString_AsString(unicode);
1216}
1217
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1219{
1220 if (!PyUnicode_Check(unicode)) {
1221 PyErr_BadArgument();
1222 goto onError;
1223 }
1224 return PyUnicode_AS_UNICODE(unicode);
1225
1226 onError:
1227 return NULL;
1228}
1229
Martin v. Löwis18e16552006-02-15 17:27:45 +00001230Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231{
1232 if (!PyUnicode_Check(unicode)) {
1233 PyErr_BadArgument();
1234 goto onError;
1235 }
1236 return PyUnicode_GET_SIZE(unicode);
1237
1238 onError:
1239 return -1;
1240}
1241
Thomas Wouters78890102000-07-22 19:25:51 +00001242const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001243{
1244 return unicode_default_encoding;
1245}
1246
1247int PyUnicode_SetDefaultEncoding(const char *encoding)
1248{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001249 if (strcmp(encoding, unicode_default_encoding) != 0) {
1250 PyErr_Format(PyExc_ValueError,
1251 "Can only set default encoding to %s",
1252 unicode_default_encoding);
1253 return -1;
1254 }
Fred Drakee4315f52000-05-09 19:53:39 +00001255 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001256}
1257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001258/* error handling callback helper:
1259 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001260 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001261 and adjust various state variables.
1262 return 0 on success, -1 on error
1263*/
1264
1265static
1266int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1267 const char *encoding, const char *reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001268 const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001269 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001270{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001271 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272
1273 PyObject *restuple = NULL;
1274 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001275 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001276 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001277 Py_ssize_t requiredsize;
1278 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001279 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001280 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001281 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001282 int res = -1;
1283
1284 if (*errorHandler == NULL) {
1285 *errorHandler = PyCodec_LookupError(errors);
1286 if (*errorHandler == NULL)
1287 goto onError;
1288 }
1289
1290 if (*exceptionObject == NULL) {
1291 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001292 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001293 if (*exceptionObject == NULL)
1294 goto onError;
1295 }
1296 else {
1297 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1298 goto onError;
1299 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1300 goto onError;
1301 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1302 goto onError;
1303 }
1304
1305 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1306 if (restuple == NULL)
1307 goto onError;
1308 if (!PyTuple_Check(restuple)) {
1309 PyErr_Format(PyExc_TypeError, &argparse[4]);
1310 goto onError;
1311 }
1312 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1313 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001314
1315 /* Copy back the bytes variables, which might have been modified by the
1316 callback */
1317 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1318 if (!inputobj)
1319 goto onError;
1320 if (!PyBytes_Check(inputobj)) {
1321 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1322 }
1323 *input = PyBytes_AS_STRING(inputobj);
1324 insize = PyBytes_GET_SIZE(inputobj);
1325 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001326 /* we can DECREF safely, as the exception has another reference,
1327 so the object won't go away. */
1328 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001330 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001331 newpos = insize+newpos;
1332 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001333 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001334 goto onError;
1335 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001336
1337 /* need more space? (at least enough for what we
1338 have+the replacement+the rest of the string (starting
1339 at the new input position), so we won't have to check space
1340 when there are no errors in the rest of the string) */
1341 repptr = PyUnicode_AS_UNICODE(repunicode);
1342 repsize = PyUnicode_GET_SIZE(repunicode);
1343 requiredsize = *outpos + repsize + insize-newpos;
1344 if (requiredsize > outsize) {
1345 if (requiredsize<2*outsize)
1346 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001347 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001348 goto onError;
1349 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1350 }
1351 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001352 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001353 Py_UNICODE_COPY(*outptr, repptr, repsize);
1354 *outptr += repsize;
1355 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001356
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001357 /* we made it! */
1358 res = 0;
1359
1360 onError:
1361 Py_XDECREF(restuple);
1362 return res;
1363}
1364
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001365/* --- UTF-7 Codec -------------------------------------------------------- */
1366
1367/* see RFC2152 for details */
1368
/* Classification of the 128 ASCII characters for the UTF-7 codec:
   indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1387
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c must not be written directly: it is outside
   1..127, is marked special (1) in utf7_special, or falls in the
   optional whitespace (2) / Set O (3) classes while the corresponding
   encodeWS / encodeO flag is set. */
#define SPECIAL(c, encodeO, encodeWS)                       \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||     \
     (encodeWS && (utf7_special[(c)] == 2)) ||              \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)                                              \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the base64 alphabet characters. */
#define B64CHAR(c)                                          \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* Inverse of B64 for a character that satisfies B64CHAR. */
#define UB64(c)                                             \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?       \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through `out` for as long as `ch` holds at least
   six pending bits; `bits` is decremented accordingly.  Expands to a
   bare while loop (no trailing semicolon). */
#define ENCODE(out, ch, bits)                               \
    while (bits >= 6) {                                     \
        *out++ = B64(ch >> (bits-6));                       \
        bits -= 6;                                          \
    }

/* Extract 16-bit code units from `ch` for as long as at least sixteen
   bits are pending and store them through `out`.  Relies on a variable
   `errmsg` and a label `utf7Error` in the enclosing function. */
#define DECODE(out, ch, bits, surrogate)                    \
    while (bits >= 16) {                                    \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16;                                         \
        if (surrogate) {                                    \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                  \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {    \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                  \
            surrogate = 1;                                  \
            errmsg = "code pairs are not supported";        \
            goto utf7Error;                                 \
        } else {                                            \
            *out++ = outCh;                                 \
        }                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001430
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001431PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001432 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001433 const char *errors)
1434{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001436 Py_ssize_t startinpos;
1437 Py_ssize_t endinpos;
1438 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001439 const char *e;
1440 PyUnicodeObject *unicode;
1441 Py_UNICODE *p;
1442 const char *errmsg = "";
1443 int inShift = 0;
1444 unsigned int bitsleft = 0;
1445 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 int surrogate = 0;
1447 PyObject *errorHandler = NULL;
1448 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001449
1450 unicode = _PyUnicode_New(size);
1451 if (!unicode)
1452 return NULL;
1453 if (size == 0)
1454 return (PyObject *)unicode;
1455
1456 p = unicode->str;
1457 e = s + size;
1458
1459 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001460 Py_UNICODE ch;
1461 restart:
1462 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001463
1464 if (inShift) {
1465 if ((ch == '-') || !B64CHAR(ch)) {
1466 inShift = 0;
1467 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001468
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001469 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1470 if (bitsleft >= 6) {
1471 /* The shift sequence has a partial character in it. If
1472 bitsleft < 6 then we could just classify it as padding
1473 but that is not the case here */
1474
1475 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001476 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001477 }
1478 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001479 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480 here so indicate the potential of a misencoded character. */
1481
1482 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1483 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1484 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001485 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001486 }
1487
1488 if (ch == '-') {
1489 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001490 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001491 inShift = 1;
1492 }
1493 } else if (SPECIAL(ch,0,0)) {
1494 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001495 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001496 } else {
1497 *p++ = ch;
1498 }
1499 } else {
1500 charsleft = (charsleft << 6) | UB64(ch);
1501 bitsleft += 6;
1502 s++;
1503 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1504 }
1505 }
1506 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001507 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001508 s++;
1509 if (s < e && *s == '-') {
1510 s++;
1511 *p++ = '+';
1512 } else
1513 {
1514 inShift = 1;
1515 bitsleft = 0;
1516 }
1517 }
1518 else if (SPECIAL(ch,0,0)) {
1519 errmsg = "unexpected special character";
1520 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001521 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001522 }
1523 else {
1524 *p++ = ch;
1525 s++;
1526 }
1527 continue;
1528 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529 outpos = p-PyUnicode_AS_UNICODE(unicode);
1530 endinpos = s-starts;
1531 if (unicode_decode_call_errorhandler(
1532 errors, &errorHandler,
1533 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001534 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001535 (PyObject **)&unicode, &outpos, &p))
1536 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001537 }
1538
1539 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 outpos = p-PyUnicode_AS_UNICODE(unicode);
1541 endinpos = size;
1542 if (unicode_decode_call_errorhandler(
1543 errors, &errorHandler,
1544 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001545 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001548 if (s < e)
1549 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550 }
1551
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001552 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 goto onError;
1554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001555 Py_XDECREF(errorHandler);
1556 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557 return (PyObject *)unicode;
1558
1559onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001560 Py_XDECREF(errorHandler);
1561 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001562 Py_DECREF(unicode);
1563 return NULL;
1564}
1565
1566
1567PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001568 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001569 int encodeSetO,
1570 int encodeWhiteSpace,
1571 const char *errors)
1572{
1573 PyObject *v;
1574 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001575 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001576 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001577 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001578 unsigned int bitsleft = 0;
1579 unsigned long charsleft = 0;
1580 char * out;
1581 char * start;
1582
1583 if (size == 0)
Walter Dörwald51ab4142007-05-05 14:43:36 +00001584 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001585
Walter Dörwald51ab4142007-05-05 14:43:36 +00001586 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001587 if (v == NULL)
1588 return NULL;
1589
Walter Dörwald51ab4142007-05-05 14:43:36 +00001590 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001591 for (;i < size; ++i) {
1592 Py_UNICODE ch = s[i];
1593
1594 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001595 if (ch == '+') {
1596 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597 *out++ = '-';
1598 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1599 charsleft = ch;
1600 bitsleft = 16;
1601 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001602 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001603 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001604 } else {
1605 *out++ = (char) ch;
1606 }
1607 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1609 *out++ = B64(charsleft << (6-bitsleft));
1610 charsleft = 0;
1611 bitsleft = 0;
1612 /* Characters not in the BASE64 set implicitly unshift the sequence
1613 so no '-' is required, except if the character is itself a '-' */
1614 if (B64CHAR(ch) || ch == '-') {
1615 *out++ = '-';
1616 }
1617 inShift = 0;
1618 *out++ = (char) ch;
1619 } else {
1620 bitsleft += 16;
1621 charsleft = (charsleft << 16) | ch;
1622 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1623
1624 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001625 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001626 or '-' then the shift sequence will be terminated implicitly and we
1627 don't have to insert a '-'. */
1628
1629 if (bitsleft == 0) {
1630 if (i + 1 < size) {
1631 Py_UNICODE ch2 = s[i+1];
1632
1633 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001634
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001635 } else if (B64CHAR(ch2) || ch2 == '-') {
1636 *out++ = '-';
1637 inShift = 0;
1638 } else {
1639 inShift = 0;
1640 }
1641
1642 }
1643 else {
1644 *out++ = '-';
1645 inShift = 0;
1646 }
1647 }
Tim Petersced69f82003-09-16 20:30:58 +00001648 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001649 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001650 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001651 if (bitsleft) {
1652 *out++= B64(charsleft << (6-bitsleft) );
1653 *out++ = '-';
1654 }
1655
Walter Dörwald51ab4142007-05-05 14:43:36 +00001656 if (PyBytes_Resize(v, out - start)) {
1657 Py_DECREF(v);
1658 return NULL;
1659 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001660 return v;
1661}
1662
1663#undef SPECIAL
1664#undef B64
1665#undef B64CHAR
1666#undef UB64
1667#undef ENCODE
1668#undef DECODE
1669
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670/* --- UTF-8 Codec -------------------------------------------------------- */
1671
/* Map a UTF-8 lead byte to the total length in bytes of the sequence it
   starts.  Zero means the byte is not a legal lead byte (a continuation
   byte 0x80..0xBF, or 0xFE/0xFF).  See RFC 2279 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00..0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE..0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1693
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001695 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 const char *errors)
1697{
Walter Dörwald69652032004-09-07 20:24:22 +00001698 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1699}
1700
1701PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001702 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001703 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001704 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001705{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001708 Py_ssize_t startinpos;
1709 Py_ssize_t endinpos;
1710 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 const char *e;
1712 PyUnicodeObject *unicode;
1713 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001714 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001715 PyObject *errorHandler = NULL;
1716 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717
1718 /* Note: size will always be longer than the resulting Unicode
1719 character count */
1720 unicode = _PyUnicode_New(size);
1721 if (!unicode)
1722 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001723 if (size == 0) {
1724 if (consumed)
1725 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728
1729 /* Unpack UTF-8 encoded data */
1730 p = unicode->str;
1731 e = s + size;
1732
1733 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001734 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735
1736 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001737 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 s++;
1739 continue;
1740 }
1741
1742 n = utf8_code_length[ch];
1743
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001744 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001745 if (consumed)
1746 break;
1747 else {
1748 errmsg = "unexpected end of data";
1749 startinpos = s-starts;
1750 endinpos = size;
1751 goto utf8Error;
1752 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
1755 switch (n) {
1756
1757 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001758 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001759 startinpos = s-starts;
1760 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001761 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762
1763 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001764 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001765 startinpos = s-starts;
1766 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001767 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768
1769 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001770 if ((s[1] & 0xc0) != 0x80) {
1771 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001772 startinpos = s-starts;
1773 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001774 goto utf8Error;
1775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001777 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 startinpos = s-starts;
1779 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001780 errmsg = "illegal encoding";
1781 goto utf8Error;
1782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001784 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 break;
1786
1787 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001788 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001789 (s[2] & 0xc0) != 0x80) {
1790 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001791 startinpos = s-starts;
1792 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001793 goto utf8Error;
1794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001796 if (ch < 0x0800) {
1797 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001798 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001799
1800 XXX For wide builds (UCS-4) we should probably try
1801 to recombine the surrogates into a single code
1802 unit.
1803 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001804 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 startinpos = s-starts;
1806 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001807 goto utf8Error;
1808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001810 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001811 break;
1812
1813 case 4:
1814 if ((s[1] & 0xc0) != 0x80 ||
1815 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001816 (s[3] & 0xc0) != 0x80) {
1817 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
1819 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001820 goto utf8Error;
1821 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001822 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1823 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1824 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001825 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001826 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001827 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001828 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001829 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001830 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 startinpos = s-starts;
1832 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001833 goto utf8Error;
1834 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001835#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001836 *p++ = (Py_UNICODE)ch;
1837#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001838 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001839
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001840 /* translate from 10000..10FFFF to 0..FFFF */
1841 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001842
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001843 /* high surrogate = top 10 bits added to D800 */
1844 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001845
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001846 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001847 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001848#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 break;
1850
1851 default:
1852 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001853 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001854 startinpos = s-starts;
1855 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001856 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 }
1858 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001859 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001860
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001861 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(unicode);
1863 if (unicode_decode_call_errorhandler(
1864 errors, &errorHandler,
1865 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001866 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 (PyObject **)&unicode, &outpos, &p))
1868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869 }
Walter Dörwald69652032004-09-07 20:24:22 +00001870 if (consumed)
1871 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872
1873 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001874 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 goto onError;
1876
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 Py_XDECREF(errorHandler);
1878 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879 return (PyObject *)unicode;
1880
1881onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001882 Py_XDECREF(errorHandler);
1883 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884 Py_DECREF(unicode);
1885 return NULL;
1886}
1887
Tim Peters602f7402002-04-27 18:03:26 +00001888/* Allocation strategy: if the string is short, convert into a stack buffer
1889 and allocate exactly as much space needed at the end. Else allocate the
1890 maximum possible needed (4 result bytes per Unicode character), and return
1891 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001892*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001893PyObject *
1894PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001895 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001896 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897{
Tim Peters602f7402002-04-27 18:03:26 +00001898#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001899
Martin v. Löwis18e16552006-02-15 17:27:45 +00001900 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001901 PyObject *v; /* result string object */
1902 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001903 Py_ssize_t nallocated; /* number of result bytes allocated */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001904 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001905 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001906
Tim Peters602f7402002-04-27 18:03:26 +00001907 assert(s != NULL);
1908 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909
Tim Peters602f7402002-04-27 18:03:26 +00001910 if (size <= MAX_SHORT_UNICHARS) {
1911 /* Write into the stack buffer; nallocated can't overflow.
1912 * At the end, we'll allocate exactly as much heap space as it
1913 * turns out we need.
1914 */
1915 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1916 v = NULL; /* will allocate after we're done */
1917 p = stackbuf;
1918 }
1919 else {
1920 /* Overallocate on the heap, and give the excess back at the end. */
1921 nallocated = size * 4;
1922 if (nallocated / 4 != size) /* overflow! */
1923 return PyErr_NoMemory();
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001924 v = PyBytes_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00001925 if (v == NULL)
1926 return NULL;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001927 p = PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001928 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001929
Tim Peters602f7402002-04-27 18:03:26 +00001930 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001931 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001932
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001933 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001934 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001936
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001938 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001939 *p++ = (char)(0xc0 | (ch >> 6));
1940 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001941 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001942 else {
Tim Peters602f7402002-04-27 18:03:26 +00001943 /* Encode UCS2 Unicode ordinals */
1944 if (ch < 0x10000) {
1945 /* Special case: check for high surrogate */
1946 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1947 Py_UCS4 ch2 = s[i];
1948 /* Check for low surrogate and combine the two to
1949 form a UCS4 value */
1950 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001951 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001952 i++;
1953 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001954 }
Tim Peters602f7402002-04-27 18:03:26 +00001955 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001956 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001957 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001958 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1959 *p++ = (char)(0x80 | (ch & 0x3f));
1960 continue;
1961 }
1962encodeUCS4:
1963 /* Encode UCS4 Unicode ordinals */
1964 *p++ = (char)(0xf0 | (ch >> 18));
1965 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1966 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1967 *p++ = (char)(0x80 | (ch & 0x3f));
1968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001970
Tim Peters602f7402002-04-27 18:03:26 +00001971 if (v == NULL) {
1972 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001973 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001974 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001975 v = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001976 }
1977 else {
1978 /* Cut back to size actually needed. */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001979 nneeded = p - PyBytes_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001980 assert(nneeded <= nallocated);
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001981 PyBytes_Resize(v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00001982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001984
Tim Peters602f7402002-04-27 18:03:26 +00001985#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986}
1987
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1989{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 if (!PyUnicode_Check(unicode)) {
1991 PyErr_BadArgument();
1992 return NULL;
1993 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001994 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1995 PyUnicode_GET_SIZE(unicode),
1996 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997}
1998
1999/* --- UTF-16 Codec ------------------------------------------------------- */
2000
Tim Peters772747b2001-08-09 22:21:55 +00002001PyObject *
2002PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002003 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002004 const char *errors,
2005 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Walter Dörwald69652032004-09-07 20:24:22 +00002007 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2008}
2009
2010PyObject *
2011PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002012 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002013 const char *errors,
2014 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002015 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002016{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002018 Py_ssize_t startinpos;
2019 Py_ssize_t endinpos;
2020 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 PyUnicodeObject *unicode;
2022 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002023 const unsigned char *q, *e;
2024 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002025 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002026 /* Offsets from q for retrieving byte pairs in the right order. */
2027#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2028 int ihi = 1, ilo = 0;
2029#else
2030 int ihi = 0, ilo = 1;
2031#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 PyObject *errorHandler = NULL;
2033 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
2035 /* Note: size will always be longer than the resulting Unicode
2036 character count */
2037 unicode = _PyUnicode_New(size);
2038 if (!unicode)
2039 return NULL;
2040 if (size == 0)
2041 return (PyObject *)unicode;
2042
2043 /* Unpack UTF-16 encoded data */
2044 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002045 q = (unsigned char *)s;
2046 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047
2048 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002049 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002051 /* Check for BOM marks (U+FEFF) in the input and adjust current
2052 byte order setting accordingly. In native mode, the leading BOM
2053 mark is skipped, in all other modes, it is copied to the output
2054 stream as-is (giving a ZWNBSP character). */
2055 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002056 if (size >= 2) {
2057 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002058#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002059 if (bom == 0xFEFF) {
2060 q += 2;
2061 bo = -1;
2062 }
2063 else if (bom == 0xFFFE) {
2064 q += 2;
2065 bo = 1;
2066 }
Tim Petersced69f82003-09-16 20:30:58 +00002067#else
Walter Dörwald69652032004-09-07 20:24:22 +00002068 if (bom == 0xFEFF) {
2069 q += 2;
2070 bo = 1;
2071 }
2072 else if (bom == 0xFFFE) {
2073 q += 2;
2074 bo = -1;
2075 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002076#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002077 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079
Tim Peters772747b2001-08-09 22:21:55 +00002080 if (bo == -1) {
2081 /* force LE */
2082 ihi = 1;
2083 ilo = 0;
2084 }
2085 else if (bo == 1) {
2086 /* force BE */
2087 ihi = 0;
2088 ilo = 1;
2089 }
2090
2091 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002092 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00002093 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00002095 if (consumed)
2096 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002097 errmsg = "truncated data";
2098 startinpos = ((const char *)q)-starts;
2099 endinpos = ((const char *)e)-starts;
2100 goto utf16Error;
2101 /* The remaining input chars are ignored if the callback
2102 chooses to skip the input */
2103 }
2104 ch = (q[ihi] << 8) | q[ilo];
2105
Tim Peters772747b2001-08-09 22:21:55 +00002106 q += 2;
2107
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 if (ch < 0xD800 || ch > 0xDFFF) {
2109 *p++ = ch;
2110 continue;
2111 }
2112
2113 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002114 if (q >= e) {
2115 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002116 startinpos = (((const char *)q)-2)-starts;
2117 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002118 goto utf16Error;
2119 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002120 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002121 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2122 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002123 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002124#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002125 *p++ = ch;
2126 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002127#else
2128 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002129#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002130 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002131 }
2132 else {
2133 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 startinpos = (((const char *)q)-4)-starts;
2135 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002136 goto utf16Error;
2137 }
2138
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002140 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002141 startinpos = (((const char *)q)-2)-starts;
2142 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002143 /* Fall through to report the error */
2144
2145 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002146 outpos = p-PyUnicode_AS_UNICODE(unicode);
2147 if (unicode_decode_call_errorhandler(
2148 errors, &errorHandler,
2149 "utf16", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002150 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002151 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002152 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 }
2154
2155 if (byteorder)
2156 *byteorder = bo;
2157
Walter Dörwald69652032004-09-07 20:24:22 +00002158 if (consumed)
2159 *consumed = (const char *)q-starts;
2160
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002162 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 goto onError;
2164
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 Py_XDECREF(errorHandler);
2166 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 return (PyObject *)unicode;
2168
2169onError:
2170 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002171 Py_XDECREF(errorHandler);
2172 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 return NULL;
2174}
2175
Tim Peters772747b2001-08-09 22:21:55 +00002176PyObject *
2177PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002178 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002179 const char *errors,
2180 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181{
2182 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002183 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002184#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00002185 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002186#else
2187 const int pairs = 0;
2188#endif
Tim Peters772747b2001-08-09 22:21:55 +00002189 /* Offsets from p for storing byte pairs in the right order. */
2190#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2191 int ihi = 1, ilo = 0;
2192#else
2193 int ihi = 0, ilo = 1;
2194#endif
2195
2196#define STORECHAR(CH) \
2197 do { \
2198 p[ihi] = ((CH) >> 8) & 0xff; \
2199 p[ilo] = (CH) & 0xff; \
2200 p += 2; \
2201 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002203#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002204 for (i = pairs = 0; i < size; i++)
2205 if (s[i] >= 0x10000)
2206 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002207#endif
Walter Dörwald3cc34522007-05-04 10:48:27 +00002208 v = PyBytes_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00002209 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 if (v == NULL)
2211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
Walter Dörwald3cc34522007-05-04 10:48:27 +00002213 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002215 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002216 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002217 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002218
2219 if (byteorder == -1) {
2220 /* force LE */
2221 ihi = 1;
2222 ilo = 0;
2223 }
2224 else if (byteorder == 1) {
2225 /* force BE */
2226 ihi = 0;
2227 ilo = 1;
2228 }
2229
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002230 while (size-- > 0) {
2231 Py_UNICODE ch = *s++;
2232 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002233#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002234 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002235 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2236 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002238#endif
Tim Peters772747b2001-08-09 22:21:55 +00002239 STORECHAR(ch);
2240 if (ch2)
2241 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002244#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245}
2246
2247PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2248{
2249 if (!PyUnicode_Check(unicode)) {
2250 PyErr_BadArgument();
2251 return NULL;
2252 }
2253 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2254 PyUnicode_GET_SIZE(unicode),
2255 NULL,
2256 0);
2257}
2258
2259/* --- Unicode Escape Codec ----------------------------------------------- */
2260
Fredrik Lundh06d12682001-01-24 07:59:11 +00002261static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002262
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002264 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 const char *errors)
2266{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002267 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002268 Py_ssize_t startinpos;
2269 Py_ssize_t endinpos;
2270 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002275 char* message;
2276 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002277 PyObject *errorHandler = NULL;
2278 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002279
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 /* Escaped strings will always be longer than the resulting
2281 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002282 length after conversion to the true value.
2283 (but if the error callback returns a long replacement string
2284 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285 v = _PyUnicode_New(size);
2286 if (v == NULL)
2287 goto onError;
2288 if (size == 0)
2289 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002293
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294 while (s < end) {
2295 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002296 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002297 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298
2299 /* Non-escape characters are interpreted as Unicode ordinals */
2300 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002301 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302 continue;
2303 }
2304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002305 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 /* \ - Escapes */
2307 s++;
2308 switch (*s++) {
2309
2310 /* \x escapes */
2311 case '\n': break;
2312 case '\\': *p++ = '\\'; break;
2313 case '\'': *p++ = '\''; break;
2314 case '\"': *p++ = '\"'; break;
2315 case 'b': *p++ = '\b'; break;
2316 case 'f': *p++ = '\014'; break; /* FF */
2317 case 't': *p++ = '\t'; break;
2318 case 'n': *p++ = '\n'; break;
2319 case 'r': *p++ = '\r'; break;
2320 case 'v': *p++ = '\013'; break; /* VT */
2321 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2322
2323 /* \OOO (octal) escapes */
2324 case '0': case '1': case '2': case '3':
2325 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002326 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002328 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002330 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002331 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002332 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002333 break;
2334
Fredrik Lundhccc74732001-02-18 22:13:49 +00002335 /* hex escapes */
2336 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002338 digits = 2;
2339 message = "truncated \\xXX escape";
2340 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341
Fredrik Lundhccc74732001-02-18 22:13:49 +00002342 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002344 digits = 4;
2345 message = "truncated \\uXXXX escape";
2346 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347
Fredrik Lundhccc74732001-02-18 22:13:49 +00002348 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002349 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002350 digits = 8;
2351 message = "truncated \\UXXXXXXXX escape";
2352 hexescape:
2353 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002354 outpos = p-PyUnicode_AS_UNICODE(v);
2355 if (s+digits>end) {
2356 endinpos = size;
2357 if (unicode_decode_call_errorhandler(
2358 errors, &errorHandler,
2359 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002360 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002361 (PyObject **)&v, &outpos, &p))
2362 goto onError;
2363 goto nextByte;
2364 }
2365 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002366 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002367 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002368 endinpos = (s+i+1)-starts;
2369 if (unicode_decode_call_errorhandler(
2370 errors, &errorHandler,
2371 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002372 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002373 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002374 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002375 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002376 }
2377 chr = (chr<<4) & ~0xF;
2378 if (c >= '0' && c <= '9')
2379 chr += c - '0';
2380 else if (c >= 'a' && c <= 'f')
2381 chr += 10 + c - 'a';
2382 else
2383 chr += 10 + c - 'A';
2384 }
2385 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002386 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002387 /* _decoding_error will have already written into the
2388 target buffer. */
2389 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002390 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002391 /* when we get here, chr is a 32-bit unicode character */
2392 if (chr <= 0xffff)
2393 /* UCS-2 character */
2394 *p++ = (Py_UNICODE) chr;
2395 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002396 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002397 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002398#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002399 *p++ = chr;
2400#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002401 chr -= 0x10000L;
2402 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002403 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002404#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002405 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002406 endinpos = s-starts;
2407 outpos = p-PyUnicode_AS_UNICODE(v);
2408 if (unicode_decode_call_errorhandler(
2409 errors, &errorHandler,
2410 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002411 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002412 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002413 goto onError;
2414 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002415 break;
2416
2417 /* \N{name} */
2418 case 'N':
2419 message = "malformed \\N character escape";
2420 if (ucnhash_CAPI == NULL) {
2421 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002422 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002423 m = PyImport_ImportModule("unicodedata");
2424 if (m == NULL)
2425 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002426 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002427 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002428 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002429 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002430 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002431 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002432 if (ucnhash_CAPI == NULL)
2433 goto ucnhashError;
2434 }
2435 if (*s == '{') {
2436 const char *start = s+1;
2437 /* look for the closing brace */
2438 while (*s != '}' && s < end)
2439 s++;
2440 if (s > start && s < end && *s == '}') {
2441 /* found a name. look it up in the unicode database */
2442 message = "unknown Unicode character name";
2443 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002444 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002445 goto store;
2446 }
2447 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002448 endinpos = s-starts;
2449 outpos = p-PyUnicode_AS_UNICODE(v);
2450 if (unicode_decode_call_errorhandler(
2451 errors, &errorHandler,
2452 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002453 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002454 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002455 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002456 break;
2457
2458 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002459 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002460 message = "\\ at end of string";
2461 s--;
2462 endinpos = s-starts;
2463 outpos = p-PyUnicode_AS_UNICODE(v);
2464 if (unicode_decode_call_errorhandler(
2465 errors, &errorHandler,
2466 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002467 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002469 goto onError;
2470 }
2471 else {
2472 *p++ = '\\';
2473 *p++ = (unsigned char)s[-1];
2474 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002475 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002477 nextByte:
2478 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002480 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002482 Py_XDECREF(errorHandler);
2483 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002485
Fredrik Lundhccc74732001-02-18 22:13:49 +00002486ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002487 PyErr_SetString(
2488 PyExc_UnicodeError,
2489 "\\N escapes not supported (can't load unicodedata module)"
2490 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002491 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002492 Py_XDECREF(errorHandler);
2493 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002494 return NULL;
2495
Fredrik Lundhccc74732001-02-18 22:13:49 +00002496onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498 Py_XDECREF(errorHandler);
2499 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 return NULL;
2501}
2502
/* Unicode-Escape encoding.

   PyUnicode_EncodeUnicodeEscape() below returns a Unicode-Escape
   version of a Py_UNICODE buffer.  It takes no `quotes` argument and
   does not enclose its output in quotes.

*/
2509
Thomas Wouters477c8d52006-05-27 19:21:47 +00002510Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2511 Py_ssize_t size,
2512 Py_UNICODE ch)
2513{
2514 /* like wcschr, but doesn't stop at NULL characters */
2515
2516 while (size-- > 0) {
2517 if (*s == ch)
2518 return s;
2519 s++;
2520 }
2521
2522 return NULL;
2523}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002524
/* Lowercase hexadecimal digits, indexed by nibble value (0..15). */
static const char *hexdigits = "0123456789abcdef";
2526
2527PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2528 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529{
2530 PyObject *repr;
2531 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532
Thomas Wouters89f507f2006-12-13 04:49:30 +00002533 /* XXX(nnorwitz): rather than over-allocating, it would be
2534 better to choose a different scheme. Perhaps scan the
2535 first N-chars of the string and allocate based on that size.
2536 */
2537 /* Initial allocation is based on the longest-possible unichr
2538 escape.
2539
2540 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2541 unichr, so in this case it's the longest unichr escape. In
2542 narrow (UTF-16) builds this is five chars per source unichr
2543 since there are two unichrs in the surrogate pair, so in narrow
2544 (UTF-16) builds it's not the longest unichr escape.
2545
2546 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2547 so in the narrow (UTF-16) build case it's the longest unichr
2548 escape.
2549 */
2550
Walter Dörwald79e913e2007-05-12 11:08:06 +00002551 repr = PyBytes_FromStringAndSize(NULL,
Thomas Wouters89f507f2006-12-13 04:49:30 +00002552#ifdef Py_UNICODE_WIDE
2553 + 10*size
2554#else
2555 + 6*size
2556#endif
2557 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558 if (repr == NULL)
2559 return NULL;
2560
Walter Dörwald79e913e2007-05-12 11:08:06 +00002561 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 while (size-- > 0) {
2564 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002565
Walter Dörwald79e913e2007-05-12 11:08:06 +00002566 /* Escape backslashes */
2567 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 *p++ = '\\';
2569 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00002570 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002571 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002572
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002573#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002574 /* Map 21-bit characters to '\U00xxxxxx' */
2575 else if (ch >= 0x10000) {
2576 *p++ = '\\';
2577 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002578 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2579 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2580 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2581 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2582 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2583 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2584 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2585 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002586 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002587 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002588#else
2589 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002590 else if (ch >= 0xD800 && ch < 0xDC00) {
2591 Py_UNICODE ch2;
2592 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002593
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002594 ch2 = *s++;
2595 size--;
2596 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2597 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2598 *p++ = '\\';
2599 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002600 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2601 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2602 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2603 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2604 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2605 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2606 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2607 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002608 continue;
2609 }
2610 /* Fall through: isolated surrogates are copied as-is */
2611 s--;
2612 size++;
2613 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00002614#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002615
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002617 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 *p++ = '\\';
2619 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002620 *p++ = hexdigits[(ch >> 12) & 0x000F];
2621 *p++ = hexdigits[(ch >> 8) & 0x000F];
2622 *p++ = hexdigits[(ch >> 4) & 0x000F];
2623 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002625
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002626 /* Map special whitespace to '\t', \n', '\r' */
2627 else if (ch == '\t') {
2628 *p++ = '\\';
2629 *p++ = 't';
2630 }
2631 else if (ch == '\n') {
2632 *p++ = '\\';
2633 *p++ = 'n';
2634 }
2635 else if (ch == '\r') {
2636 *p++ = '\\';
2637 *p++ = 'r';
2638 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002639
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002640 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002641 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002643 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002644 *p++ = hexdigits[(ch >> 4) & 0x000F];
2645 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002646 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002647
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648 /* Copy everything else as-is */
2649 else
2650 *p++ = (char) ch;
2651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652
2653 *p = '\0';
Walter Dörwald79e913e2007-05-12 11:08:06 +00002654 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2655 Py_DECREF(repr);
2656 return NULL;
2657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658 return repr;
2659}
2660
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2662{
Walter Dörwald79e913e2007-05-12 11:08:06 +00002663 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664 if (!PyUnicode_Check(unicode)) {
2665 PyErr_BadArgument();
2666 return NULL;
2667 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00002668 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2669 PyUnicode_GET_SIZE(unicode));
2670
2671 if (!s)
2672 return NULL;
2673 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2674 PyBytes_GET_SIZE(s));
2675 Py_DECREF(s);
2676 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677}
2678
2679/* --- Raw Unicode Escape Codec ------------------------------------------- */
2680
2681PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002682 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 const char *errors)
2684{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002685 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002686 Py_ssize_t startinpos;
2687 Py_ssize_t endinpos;
2688 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002690 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 const char *end;
2692 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002693 PyObject *errorHandler = NULL;
2694 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002695
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 /* Escaped strings will always be longer than the resulting
2697 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002698 length after conversion to the true value. (But decoding error
2699 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700 v = _PyUnicode_New(size);
2701 if (v == NULL)
2702 goto onError;
2703 if (size == 0)
2704 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 end = s + size;
2707 while (s < end) {
2708 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002709 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002711 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712
2713 /* Non-escape characters are interpreted as Unicode ordinals */
2714 if (*s != '\\') {
2715 *p++ = (unsigned char)*s++;
2716 continue;
2717 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719
2720 /* \u-escapes are only interpreted iff the number of leading
2721 backslashes if odd */
2722 bs = s;
2723 for (;s < end;) {
2724 if (*s != '\\')
2725 break;
2726 *p++ = (unsigned char)*s++;
2727 }
2728 if (((s - bs) & 1) == 0 ||
2729 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002730 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 continue;
2732 }
2733 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002734 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 s++;
2736
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002737 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002739 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742 endinpos = s-starts;
2743 if (unicode_decode_call_errorhandler(
2744 errors, &errorHandler,
2745 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002746 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002749 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 }
2751 x = (x<<4) & ~0xF;
2752 if (c >= '0' && c <= '9')
2753 x += c - '0';
2754 else if (c >= 'a' && c <= 'f')
2755 x += 10 + c - 'a';
2756 else
2757 x += 10 + c - 'A';
2758 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002759#ifndef Py_UNICODE_WIDE
2760 if (x > 0x10000) {
2761 if (unicode_decode_call_errorhandler(
2762 errors, &errorHandler,
2763 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00002764 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002765 (PyObject **)&v, &outpos, &p))
2766 goto onError;
2767 }
2768#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 *p++ = x;
2770 nextByte:
2771 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002773 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002774 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002775 Py_XDECREF(errorHandler);
2776 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002778
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 onError:
2780 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002781 Py_XDECREF(errorHandler);
2782 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 return NULL;
2784}
2785
2786PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002787 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788{
2789 PyObject *repr;
2790 char *p;
2791 char *q;
2792
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002793#ifdef Py_UNICODE_WIDE
Walter Dörwald711005d2007-05-12 12:03:26 +00002794 repr = PyBytes_FromStringAndSize(NULL, 10 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002795#else
Walter Dörwald711005d2007-05-12 12:03:26 +00002796 repr = PyBytes_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002797#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 if (repr == NULL)
2799 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002800 if (size == 0)
2801 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802
Walter Dörwald711005d2007-05-12 12:03:26 +00002803 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 while (size-- > 0) {
2805 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002806#ifdef Py_UNICODE_WIDE
2807 /* Map 32-bit characters to '\Uxxxxxxxx' */
2808 if (ch >= 0x10000) {
2809 *p++ = '\\';
2810 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002811 *p++ = hexdigits[(ch >> 28) & 0xf];
2812 *p++ = hexdigits[(ch >> 24) & 0xf];
2813 *p++ = hexdigits[(ch >> 20) & 0xf];
2814 *p++ = hexdigits[(ch >> 16) & 0xf];
2815 *p++ = hexdigits[(ch >> 12) & 0xf];
2816 *p++ = hexdigits[(ch >> 8) & 0xf];
2817 *p++ = hexdigits[(ch >> 4) & 0xf];
2818 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002819 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002820 else
2821#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 /* Map 16-bit characters to '\uxxxx' */
2823 if (ch >= 256) {
2824 *p++ = '\\';
2825 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00002826 *p++ = hexdigits[(ch >> 12) & 0xf];
2827 *p++ = hexdigits[(ch >> 8) & 0xf];
2828 *p++ = hexdigits[(ch >> 4) & 0xf];
2829 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 }
2831 /* Copy everything else as-is */
2832 else
2833 *p++ = (char) ch;
2834 }
2835 *p = '\0';
Walter Dörwald711005d2007-05-12 12:03:26 +00002836 if (PyBytes_Resize(repr, p - q)) {
2837 Py_DECREF(repr);
2838 return NULL;
2839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 return repr;
2841}
2842
2843PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2844{
Walter Dörwald711005d2007-05-12 12:03:26 +00002845 PyObject *s, *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00002847 PyErr_BadArgument();
2848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 }
Walter Dörwald711005d2007-05-12 12:03:26 +00002850 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2851 PyUnicode_GET_SIZE(unicode));
2852
2853 if (!s)
2854 return NULL;
2855 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2856 PyBytes_GET_SIZE(s));
2857 Py_DECREF(s);
2858 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859}
2860
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002861/* --- Unicode Internal Codec ------------------------------------------- */
2862
2863PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002864 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002865 const char *errors)
2866{
2867 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002868 Py_ssize_t startinpos;
2869 Py_ssize_t endinpos;
2870 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002871 PyUnicodeObject *v;
2872 Py_UNICODE *p;
2873 const char *end;
2874 const char *reason;
2875 PyObject *errorHandler = NULL;
2876 PyObject *exc = NULL;
2877
Neal Norwitzd43069c2006-01-08 01:12:10 +00002878#ifdef Py_UNICODE_WIDE
2879 Py_UNICODE unimax = PyUnicode_GetMax();
2880#endif
2881
Thomas Wouters89f507f2006-12-13 04:49:30 +00002882 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002883 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2884 if (v == NULL)
2885 goto onError;
2886 if (PyUnicode_GetSize((PyObject *)v) == 0)
2887 return (PyObject *)v;
2888 p = PyUnicode_AS_UNICODE(v);
2889 end = s + size;
2890
2891 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00002892 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002893 /* We have to sanity check the raw data, otherwise doom looms for
2894 some malformed UCS-4 data. */
2895 if (
2896 #ifdef Py_UNICODE_WIDE
2897 *p > unimax || *p < 0 ||
2898 #endif
2899 end-s < Py_UNICODE_SIZE
2900 )
2901 {
2902 startinpos = s - starts;
2903 if (end-s < Py_UNICODE_SIZE) {
2904 endinpos = end-starts;
2905 reason = "truncated input";
2906 }
2907 else {
2908 endinpos = s - starts + Py_UNICODE_SIZE;
2909 reason = "illegal code point (> 0x10FFFF)";
2910 }
2911 outpos = p - PyUnicode_AS_UNICODE(v);
2912 if (unicode_decode_call_errorhandler(
2913 errors, &errorHandler,
2914 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002915 &starts, &end, &startinpos, &endinpos, &exc, &s,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002916 (PyObject **)&v, &outpos, &p)) {
2917 goto onError;
2918 }
2919 }
2920 else {
2921 p++;
2922 s += Py_UNICODE_SIZE;
2923 }
2924 }
2925
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002926 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002927 goto onError;
2928 Py_XDECREF(errorHandler);
2929 Py_XDECREF(exc);
2930 return (PyObject *)v;
2931
2932 onError:
2933 Py_XDECREF(v);
2934 Py_XDECREF(errorHandler);
2935 Py_XDECREF(exc);
2936 return NULL;
2937}
2938
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939/* --- Latin-1 Codec ------------------------------------------------------ */
2940
2941PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002942 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 const char *errors)
2944{
2945 PyUnicodeObject *v;
2946 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002947
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002949 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002950 Py_UNICODE r = *(unsigned char*)s;
2951 return PyUnicode_FromUnicode(&r, 1);
2952 }
2953
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954 v = _PyUnicode_New(size);
2955 if (v == NULL)
2956 goto onError;
2957 if (size == 0)
2958 return (PyObject *)v;
2959 p = PyUnicode_AS_UNICODE(v);
2960 while (size-- > 0)
2961 *p++ = (unsigned char)*s++;
2962 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002963
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 onError:
2965 Py_XDECREF(v);
2966 return NULL;
2967}
2968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002969/* create or adjust a UnicodeEncodeError */
2970static void make_encode_exception(PyObject **exceptionObject,
2971 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002972 const Py_UNICODE *unicode, Py_ssize_t size,
2973 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002974 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002976 if (*exceptionObject == NULL) {
2977 *exceptionObject = PyUnicodeEncodeError_Create(
2978 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 }
2980 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002981 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2982 goto onError;
2983 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2984 goto onError;
2985 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2986 goto onError;
2987 return;
2988 onError:
2989 Py_DECREF(*exceptionObject);
2990 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 }
2992}
2993
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994/* raises a UnicodeEncodeError */
2995static void raise_encode_exception(PyObject **exceptionObject,
2996 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002997 const Py_UNICODE *unicode, Py_ssize_t size,
2998 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002999 const char *reason)
3000{
3001 make_encode_exception(exceptionObject,
3002 encoding, unicode, size, startpos, endpos, reason);
3003 if (*exceptionObject != NULL)
3004 PyCodec_StrictErrors(*exceptionObject);
3005}
3006
3007/* error handling callback helper:
3008 build arguments, call the callback and check the arguments,
3009 put the result into newpos and return the replacement string, which
3010 has to be freed by the caller */
3011static PyObject *unicode_encode_call_errorhandler(const char *errors,
3012 PyObject **errorHandler,
3013 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003014 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3015 Py_ssize_t startpos, Py_ssize_t endpos,
3016 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003017{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003018 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003019
3020 PyObject *restuple;
3021 PyObject *resunicode;
3022
3023 if (*errorHandler == NULL) {
3024 *errorHandler = PyCodec_LookupError(errors);
3025 if (*errorHandler == NULL)
3026 return NULL;
3027 }
3028
3029 make_encode_exception(exceptionObject,
3030 encoding, unicode, size, startpos, endpos, reason);
3031 if (*exceptionObject == NULL)
3032 return NULL;
3033
3034 restuple = PyObject_CallFunctionObjArgs(
3035 *errorHandler, *exceptionObject, NULL);
3036 if (restuple == NULL)
3037 return NULL;
3038 if (!PyTuple_Check(restuple)) {
3039 PyErr_Format(PyExc_TypeError, &argparse[4]);
3040 Py_DECREF(restuple);
3041 return NULL;
3042 }
3043 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3044 &resunicode, newpos)) {
3045 Py_DECREF(restuple);
3046 return NULL;
3047 }
3048 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003049 *newpos = size+*newpos;
3050 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003051 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003052 Py_DECREF(restuple);
3053 return NULL;
3054 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 Py_INCREF(resunicode);
3056 Py_DECREF(restuple);
3057 return resunicode;
3058}
3059
3060static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003061 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003062 const char *errors,
3063 int limit)
3064{
3065 /* output object */
3066 PyObject *res;
3067 /* pointers to the beginning and end+1 of input */
3068 const Py_UNICODE *startp = p;
3069 const Py_UNICODE *endp = p + size;
3070 /* pointer to the beginning of the unencodable characters */
3071 /* const Py_UNICODE *badp = NULL; */
3072 /* pointer into the output */
3073 char *str;
3074 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003075 Py_ssize_t respos = 0;
3076 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003077 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3078 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003079 PyObject *errorHandler = NULL;
3080 PyObject *exc = NULL;
3081 /* the following variable is used for caching string comparisons
3082 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3083 int known_errorHandler = -1;
3084
3085 /* allocate enough for a simple encoding without
3086 replacements, if we need more, we'll resize */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003087 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 if (res == NULL)
3089 goto onError;
3090 if (size == 0)
3091 return res;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003092 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003093 ressize = size;
3094
3095 while (p<endp) {
3096 Py_UNICODE c = *p;
3097
3098 /* can we encode this? */
3099 if (c<limit) {
3100 /* no overflow check, because we know that the space is enough */
3101 *str++ = (char)c;
3102 ++p;
3103 }
3104 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003105 Py_ssize_t unicodepos = p-startp;
3106 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003107 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003108 Py_ssize_t repsize;
3109 Py_ssize_t newpos;
3110 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111 Py_UNICODE *uni2;
3112 /* startpos for collecting unencodable chars */
3113 const Py_UNICODE *collstart = p;
3114 const Py_UNICODE *collend = p;
3115 /* find all unecodable characters */
3116 while ((collend < endp) && ((*collend)>=limit))
3117 ++collend;
3118 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3119 if (known_errorHandler==-1) {
3120 if ((errors==NULL) || (!strcmp(errors, "strict")))
3121 known_errorHandler = 1;
3122 else if (!strcmp(errors, "replace"))
3123 known_errorHandler = 2;
3124 else if (!strcmp(errors, "ignore"))
3125 known_errorHandler = 3;
3126 else if (!strcmp(errors, "xmlcharrefreplace"))
3127 known_errorHandler = 4;
3128 else
3129 known_errorHandler = 0;
3130 }
3131 switch (known_errorHandler) {
3132 case 1: /* strict */
3133 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3134 goto onError;
3135 case 2: /* replace */
3136 while (collstart++<collend)
3137 *str++ = '?'; /* fall through */
3138 case 3: /* ignore */
3139 p = collend;
3140 break;
3141 case 4: /* xmlcharrefreplace */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003142 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003143 /* determine replacement size (temporarily (mis)uses p) */
3144 for (p = collstart, repsize = 0; p < collend; ++p) {
3145 if (*p<10)
3146 repsize += 2+1+1;
3147 else if (*p<100)
3148 repsize += 2+2+1;
3149 else if (*p<1000)
3150 repsize += 2+3+1;
3151 else if (*p<10000)
3152 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003153#ifndef Py_UNICODE_WIDE
3154 else
3155 repsize += 2+5+1;
3156#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003157 else if (*p<100000)
3158 repsize += 2+5+1;
3159 else if (*p<1000000)
3160 repsize += 2+6+1;
3161 else
3162 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003163#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 }
3165 requiredsize = respos+repsize+(endp-collend);
3166 if (requiredsize > ressize) {
3167 if (requiredsize<2*ressize)
3168 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003169 if (PyBytes_Resize(res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003171 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172 ressize = requiredsize;
3173 }
3174 /* generate replacement (temporarily (mis)uses p) */
3175 for (p = collstart; p < collend; ++p) {
3176 str += sprintf(str, "&#%d;", (int)*p);
3177 }
3178 p = collend;
3179 break;
3180 default:
3181 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3182 encoding, reason, startp, size, &exc,
3183 collstart-startp, collend-startp, &newpos);
3184 if (repunicode == NULL)
3185 goto onError;
3186 /* need more space? (at least enough for what we
3187 have+the replacement+the rest of the string, so
3188 we won't have to check space for encodable characters) */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003189 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190 repsize = PyUnicode_GET_SIZE(repunicode);
3191 requiredsize = respos+repsize+(endp-collend);
3192 if (requiredsize > ressize) {
3193 if (requiredsize<2*ressize)
3194 requiredsize = 2*ressize;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003195 if (PyBytes_Resize(res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003196 Py_DECREF(repunicode);
3197 goto onError;
3198 }
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003199 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003200 ressize = requiredsize;
3201 }
3202 /* check if there is anything unencodable in the replacement
3203 and copy it to the output */
3204 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3205 c = *uni2;
3206 if (c >= limit) {
3207 raise_encode_exception(&exc, encoding, startp, size,
3208 unicodepos, unicodepos+1, reason);
3209 Py_DECREF(repunicode);
3210 goto onError;
3211 }
3212 *str = (char)c;
3213 }
3214 p = startp + newpos;
3215 Py_DECREF(repunicode);
3216 }
3217 }
3218 }
3219 /* Resize if we allocated to much */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003220 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 if (respos<ressize)
3222 /* If this falls res will be NULL */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003223 PyBytes_Resize(res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 Py_XDECREF(errorHandler);
3225 Py_XDECREF(exc);
3226 return res;
3227
3228 onError:
3229 Py_XDECREF(res);
3230 Py_XDECREF(errorHandler);
3231 Py_XDECREF(exc);
3232 return NULL;
3233}
3234
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003236 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 const char *errors)
3238{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240}
3241
3242PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3243{
3244 if (!PyUnicode_Check(unicode)) {
3245 PyErr_BadArgument();
3246 return NULL;
3247 }
3248 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3249 PyUnicode_GET_SIZE(unicode),
3250 NULL);
3251}
3252
/* --- 7-bit ASCII Codec -------------------------------------------------- */
3254
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003256 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 const char *errors)
3258{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003259 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 PyUnicodeObject *v;
3261 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003262 Py_ssize_t startinpos;
3263 Py_ssize_t endinpos;
3264 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003265 const char *e;
3266 PyObject *errorHandler = NULL;
3267 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003268
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003270 if (size == 1 && *(unsigned char*)s < 128) {
3271 Py_UNICODE r = *(unsigned char*)s;
3272 return PyUnicode_FromUnicode(&r, 1);
3273 }
Tim Petersced69f82003-09-16 20:30:58 +00003274
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 v = _PyUnicode_New(size);
3276 if (v == NULL)
3277 goto onError;
3278 if (size == 0)
3279 return (PyObject *)v;
3280 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 e = s + size;
3282 while (s < e) {
3283 register unsigned char c = (unsigned char)*s;
3284 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286 ++s;
3287 }
3288 else {
3289 startinpos = s-starts;
3290 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003291 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 if (unicode_decode_call_errorhandler(
3293 errors, &errorHandler,
3294 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003295 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00003300 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003301 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003302 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 Py_XDECREF(errorHandler);
3304 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003306
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 onError:
3308 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003309 Py_XDECREF(errorHandler);
3310 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 return NULL;
3312}
3313
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003315 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 const char *errors)
3317{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319}
3320
3321PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3322{
3323 if (!PyUnicode_Check(unicode)) {
3324 PyErr_BadArgument();
3325 return NULL;
3326 }
3327 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3328 PyUnicode_GET_SIZE(unicode),
3329 NULL);
3330}
3331
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003332#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003333
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003335
/* The MBCS conversion helpers below take `int` lengths.  When Py_ssize_t
   is wider than int, the public entry points convert the input in
   chunks of at most INT_MAX units and loop until it is exhausted. */
#if SIZEOF_SSIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
3339
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
      encodings, see IsDBCSLeadByteEx documentation. */

/* Return 1 if the byte at s[offset] is a DBCS lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only reported as a lead byte
   when the character before it (found with CharPrev) is the same
   position, does not itself start with a lead byte, or is two bytes
   long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3355
3356/*
3357 * Decode MBCS string into unicode object. If 'final' is set, converts
3358 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3359 */
3360static int decode_mbcs(PyUnicodeObject **v,
3361 const char *s, /* MBCS string */
3362 int size, /* sizeof MBCS string */
3363 int final)
3364{
3365 Py_UNICODE *p;
3366 Py_ssize_t n = 0;
3367 int usize = 0;
3368
3369 assert(size >= 0);
3370
3371 /* Skip trailing lead-byte unless 'final' is set */
3372 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3373 --size;
3374
3375 /* First get the size of the result */
3376 if (size > 0) {
3377 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3378 if (usize == 0) {
3379 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3380 return -1;
3381 }
3382 }
3383
3384 if (*v == NULL) {
3385 /* Create unicode object */
3386 *v = _PyUnicode_New(usize);
3387 if (*v == NULL)
3388 return -1;
3389 }
3390 else {
3391 /* Extend unicode object */
3392 n = PyUnicode_GET_SIZE(*v);
3393 if (_PyUnicode_Resize(v, n + usize) < 0)
3394 return -1;
3395 }
3396
3397 /* Do the conversion */
3398 if (size > 0) {
3399 p = PyUnicode_AS_UNICODE(*v) + n;
3400 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3401 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3402 return -1;
3403 }
3404 }
3405
3406 return size;
3407}
3408
3409PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3410 Py_ssize_t size,
3411 const char *errors,
3412 Py_ssize_t *consumed)
3413{
3414 PyUnicodeObject *v = NULL;
3415 int done;
3416
3417 if (consumed)
3418 *consumed = 0;
3419
3420#ifdef NEED_RETRY
3421 retry:
3422 if (size > INT_MAX)
3423 done = decode_mbcs(&v, s, INT_MAX, 0);
3424 else
3425#endif
3426 done = decode_mbcs(&v, s, (int)size, !consumed);
3427
3428 if (done < 0) {
3429 Py_XDECREF(v);
3430 return NULL;
3431 }
3432
3433 if (consumed)
3434 *consumed += done;
3435
3436#ifdef NEED_RETRY
3437 if (size > INT_MAX) {
3438 s += done;
3439 size -= done;
3440 goto retry;
3441 }
3442#endif
3443
3444 return (PyObject *)v;
3445}
3446
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003447PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003448 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003449 const char *errors)
3450{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003451 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3452}
3453
3454/*
3455 * Convert unicode into string object (MBCS).
3456 * Returns 0 if succeed, -1 otherwise.
3457 */
3458static int encode_mbcs(PyObject **repr,
3459 const Py_UNICODE *p, /* unicode */
3460 int size) /* size of unicode */
3461{
3462 int mbcssize = 0;
3463 Py_ssize_t n = 0;
3464
3465 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003466
3467 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003468 if (size > 0) {
3469 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3470 if (mbcssize == 0) {
3471 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3472 return -1;
3473 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003474 }
3475
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003476 if (*repr == NULL) {
3477 /* Create string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003478 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003479 if (*repr == NULL)
3480 return -1;
3481 }
3482 else {
3483 /* Extend string object */
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003484 n = PyBytes_Size(*repr);
3485 if (PyBytes_Resize(*repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003486 return -1;
3487 }
3488
3489 /* Do the conversion */
3490 if (size > 0) {
Guido van Rossumf15a29f2007-05-04 00:41:39 +00003491 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003492 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3493 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3494 return -1;
3495 }
3496 }
3497
3498 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003499}
3500
3501PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003502 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003503 const char *errors)
3504{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003505 PyObject *repr = NULL;
3506 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003507
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003508#ifdef NEED_RETRY
3509 retry:
3510 if (size > INT_MAX)
3511 ret = encode_mbcs(&repr, p, INT_MAX);
3512 else
3513#endif
3514 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003515
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003516 if (ret < 0) {
3517 Py_XDECREF(repr);
3518 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003519 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003520
3521#ifdef NEED_RETRY
3522 if (size > INT_MAX) {
3523 p += INT_MAX;
3524 size -= INT_MAX;
3525 goto retry;
3526 }
3527#endif
3528
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003529 return repr;
3530}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003531
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003532PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3533{
3534 if (!PyUnicode_Check(unicode)) {
3535 PyErr_BadArgument();
3536 return NULL;
3537 }
3538 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3539 PyUnicode_GET_SIZE(unicode),
3540 NULL);
3541}
3542
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003543#undef NEED_RETRY
3544
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003545#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003546
/* --- Character Mapping Codec -------------------------------------------- */
3548
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003550 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 PyObject *mapping,
3552 const char *errors)
3553{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003555 Py_ssize_t startinpos;
3556 Py_ssize_t endinpos;
3557 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 PyUnicodeObject *v;
3560 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003561 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 PyObject *errorHandler = NULL;
3563 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003564 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003565 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003566
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 /* Default to Latin-1 */
3568 if (mapping == NULL)
3569 return PyUnicode_DecodeLatin1(s, size, errors);
3570
3571 v = _PyUnicode_New(size);
3572 if (v == NULL)
3573 goto onError;
3574 if (size == 0)
3575 return (PyObject *)v;
3576 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003578 if (PyUnicode_CheckExact(mapping)) {
3579 mapstring = PyUnicode_AS_UNICODE(mapping);
3580 maplen = PyUnicode_GET_SIZE(mapping);
3581 while (s < e) {
3582 unsigned char ch = *s;
3583 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003585 if (ch < maplen)
3586 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003588 if (x == 0xfffe) {
3589 /* undefined mapping */
3590 outpos = p-PyUnicode_AS_UNICODE(v);
3591 startinpos = s-starts;
3592 endinpos = startinpos+1;
3593 if (unicode_decode_call_errorhandler(
3594 errors, &errorHandler,
3595 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003596 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003597 (PyObject **)&v, &outpos, &p)) {
3598 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003599 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003600 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003601 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003602 *p++ = x;
3603 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003605 }
3606 else {
3607 while (s < e) {
3608 unsigned char ch = *s;
3609 PyObject *w, *x;
3610
3611 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3612 w = PyInt_FromLong((long)ch);
3613 if (w == NULL)
3614 goto onError;
3615 x = PyObject_GetItem(mapping, w);
3616 Py_DECREF(w);
3617 if (x == NULL) {
3618 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3619 /* No mapping found means: mapping is undefined. */
3620 PyErr_Clear();
3621 x = Py_None;
3622 Py_INCREF(x);
3623 } else
3624 goto onError;
3625 }
3626
3627 /* Apply mapping */
3628 if (PyInt_Check(x)) {
3629 long value = PyInt_AS_LONG(x);
3630 if (value < 0 || value > 65535) {
3631 PyErr_SetString(PyExc_TypeError,
3632 "character mapping must be in range(65536)");
3633 Py_DECREF(x);
3634 goto onError;
3635 }
3636 *p++ = (Py_UNICODE)value;
3637 }
3638 else if (x == Py_None) {
3639 /* undefined mapping */
3640 outpos = p-PyUnicode_AS_UNICODE(v);
3641 startinpos = s-starts;
3642 endinpos = startinpos+1;
3643 if (unicode_decode_call_errorhandler(
3644 errors, &errorHandler,
3645 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003646 &starts, &e, &startinpos, &endinpos, &exc, &s,
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003647 (PyObject **)&v, &outpos, &p)) {
3648 Py_DECREF(x);
3649 goto onError;
3650 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003651 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003652 continue;
3653 }
3654 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003655 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003656
3657 if (targetsize == 1)
3658 /* 1-1 mapping */
3659 *p++ = *PyUnicode_AS_UNICODE(x);
3660
3661 else if (targetsize > 1) {
3662 /* 1-n mapping */
3663 if (targetsize > extrachars) {
3664 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003665 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3666 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003667 (targetsize << 2);
3668 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00003669 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003670 if (_PyUnicode_Resize(&v,
3671 PyUnicode_GET_SIZE(v) + needed) < 0) {
3672 Py_DECREF(x);
3673 goto onError;
3674 }
3675 p = PyUnicode_AS_UNICODE(v) + oldpos;
3676 }
3677 Py_UNICODE_COPY(p,
3678 PyUnicode_AS_UNICODE(x),
3679 targetsize);
3680 p += targetsize;
3681 extrachars -= targetsize;
3682 }
3683 /* 1-0 mapping: skip the character */
3684 }
3685 else {
3686 /* wrong return value */
3687 PyErr_SetString(PyExc_TypeError,
3688 "character mapping must return integer, None or unicode");
3689 Py_DECREF(x);
3690 goto onError;
3691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003693 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 }
3696 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003697 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 Py_XDECREF(errorHandler);
3700 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003704 Py_XDECREF(errorHandler);
3705 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 Py_XDECREF(v);
3707 return NULL;
3708}
3709
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003710/* Charmap encoding: the lookup table */
3711
3712struct encoding_map{
3713 PyObject_HEAD
3714 unsigned char level1[32];
3715 int count2, count3;
3716 unsigned char level23[1];
3717};
3718
3719static PyObject*
3720encoding_map_size(PyObject *obj, PyObject* args)
3721{
3722 struct encoding_map *map = (struct encoding_map*)obj;
3723 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3724 128*map->count3);
3725}
3726
3727static PyMethodDef encoding_map_methods[] = {
3728 {"size", encoding_map_size, METH_NOARGS,
3729 PyDoc_STR("Return the size (in bytes) of this object") },
3730 { 0 }
3731};
3732
3733static void
3734encoding_map_dealloc(PyObject* o)
3735{
3736 PyObject_FREE(o);
3737}
3738
3739static PyTypeObject EncodingMapType = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00003740 PyVarObject_HEAD_INIT(NULL, 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003741 "EncodingMap", /*tp_name*/
3742 sizeof(struct encoding_map), /*tp_basicsize*/
3743 0, /*tp_itemsize*/
3744 /* methods */
3745 encoding_map_dealloc, /*tp_dealloc*/
3746 0, /*tp_print*/
3747 0, /*tp_getattr*/
3748 0, /*tp_setattr*/
3749 0, /*tp_compare*/
3750 0, /*tp_repr*/
3751 0, /*tp_as_number*/
3752 0, /*tp_as_sequence*/
3753 0, /*tp_as_mapping*/
3754 0, /*tp_hash*/
3755 0, /*tp_call*/
3756 0, /*tp_str*/
3757 0, /*tp_getattro*/
3758 0, /*tp_setattro*/
3759 0, /*tp_as_buffer*/
3760 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3761 0, /*tp_doc*/
3762 0, /*tp_traverse*/
3763 0, /*tp_clear*/
3764 0, /*tp_richcompare*/
3765 0, /*tp_weaklistoffset*/
3766 0, /*tp_iter*/
3767 0, /*tp_iternext*/
3768 encoding_map_methods, /*tp_methods*/
3769 0, /*tp_members*/
3770 0, /*tp_getset*/
3771 0, /*tp_base*/
3772 0, /*tp_dict*/
3773 0, /*tp_descr_get*/
3774 0, /*tp_descr_set*/
3775 0, /*tp_dictoffset*/
3776 0, /*tp_init*/
3777 0, /*tp_alloc*/
3778 0, /*tp_new*/
3779 0, /*tp_free*/
3780 0, /*tp_is_gc*/
3781};
3782
3783PyObject*
3784PyUnicode_BuildEncodingMap(PyObject* string)
3785{
3786 Py_UNICODE *decode;
3787 PyObject *result;
3788 struct encoding_map *mresult;
3789 int i;
3790 int need_dict = 0;
3791 unsigned char level1[32];
3792 unsigned char level2[512];
3793 unsigned char *mlevel1, *mlevel2, *mlevel3;
3794 int count2 = 0, count3 = 0;
3795
3796 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3797 PyErr_BadArgument();
3798 return NULL;
3799 }
3800 decode = PyUnicode_AS_UNICODE(string);
3801 memset(level1, 0xFF, sizeof level1);
3802 memset(level2, 0xFF, sizeof level2);
3803
3804 /* If there isn't a one-to-one mapping of NULL to \0,
3805 or if there are non-BMP characters, we need to use
3806 a mapping dictionary. */
3807 if (decode[0] != 0)
3808 need_dict = 1;
3809 for (i = 1; i < 256; i++) {
3810 int l1, l2;
3811 if (decode[i] == 0
3812 #ifdef Py_UNICODE_WIDE
3813 || decode[i] > 0xFFFF
3814 #endif
3815 ) {
3816 need_dict = 1;
3817 break;
3818 }
3819 if (decode[i] == 0xFFFE)
3820 /* unmapped character */
3821 continue;
3822 l1 = decode[i] >> 11;
3823 l2 = decode[i] >> 7;
3824 if (level1[l1] == 0xFF)
3825 level1[l1] = count2++;
3826 if (level2[l2] == 0xFF)
3827 level2[l2] = count3++;
3828 }
3829
3830 if (count2 >= 0xFF || count3 >= 0xFF)
3831 need_dict = 1;
3832
3833 if (need_dict) {
3834 PyObject *result = PyDict_New();
3835 PyObject *key, *value;
3836 if (!result)
3837 return NULL;
3838 for (i = 0; i < 256; i++) {
3839 key = value = NULL;
3840 key = PyInt_FromLong(decode[i]);
3841 value = PyInt_FromLong(i);
3842 if (!key || !value)
3843 goto failed1;
3844 if (PyDict_SetItem(result, key, value) == -1)
3845 goto failed1;
3846 Py_DECREF(key);
3847 Py_DECREF(value);
3848 }
3849 return result;
3850 failed1:
3851 Py_XDECREF(key);
3852 Py_XDECREF(value);
3853 Py_DECREF(result);
3854 return NULL;
3855 }
3856
3857 /* Create a three-level trie */
3858 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3859 16*count2 + 128*count3 - 1);
3860 if (!result)
3861 return PyErr_NoMemory();
3862 PyObject_Init(result, &EncodingMapType);
3863 mresult = (struct encoding_map*)result;
3864 mresult->count2 = count2;
3865 mresult->count3 = count3;
3866 mlevel1 = mresult->level1;
3867 mlevel2 = mresult->level23;
3868 mlevel3 = mresult->level23 + 16*count2;
3869 memcpy(mlevel1, level1, 32);
3870 memset(mlevel2, 0xFF, 16*count2);
3871 memset(mlevel3, 0, 128*count3);
3872 count3 = 0;
3873 for (i = 1; i < 256; i++) {
3874 int o1, o2, o3, i2, i3;
3875 if (decode[i] == 0xFFFE)
3876 /* unmapped character */
3877 continue;
3878 o1 = decode[i]>>11;
3879 o2 = (decode[i]>>7) & 0xF;
3880 i2 = 16*mlevel1[o1] + o2;
3881 if (mlevel2[i2] == 0xFF)
3882 mlevel2[i2] = count3++;
3883 o3 = decode[i] & 0x7F;
3884 i3 = 128*mlevel2[i2] + o3;
3885 mlevel3[i3] = i;
3886 }
3887 return result;
3888}
3889
3890static int
3891encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3892{
3893 struct encoding_map *map = (struct encoding_map*)mapping;
3894 int l1 = c>>11;
3895 int l2 = (c>>7) & 0xF;
3896 int l3 = c & 0x7F;
3897 int i;
3898
3899#ifdef Py_UNICODE_WIDE
3900 if (c > 0xFFFF) {
3901 return -1;
3902 }
3903#endif
3904 if (c == 0)
3905 return 0;
3906 /* level 1*/
3907 i = map->level1[l1];
3908 if (i == 0xFF) {
3909 return -1;
3910 }
3911 /* level 2*/
3912 i = map->level23[16*i+l2];
3913 if (i == 0xFF) {
3914 return -1;
3915 }
3916 /* level 3 */
3917 i = map->level23[16*map->count2 + 128*i + l3];
3918 if (i == 0) {
3919 return -1;
3920 }
3921 return i;
3922}
3923
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924/* Lookup the character ch in the mapping. If the character
3925 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003926 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 PyObject *w = PyInt_FromLong((long)c);
3930 PyObject *x;
3931
3932 if (w == NULL)
3933 return NULL;
3934 x = PyObject_GetItem(mapping, w);
3935 Py_DECREF(w);
3936 if (x == NULL) {
3937 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3938 /* No mapping found means: mapping is undefined. */
3939 PyErr_Clear();
3940 x = Py_None;
3941 Py_INCREF(x);
3942 return x;
3943 } else
3944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003946 else if (x == Py_None)
3947 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 else if (PyInt_Check(x)) {
3949 long value = PyInt_AS_LONG(x);
3950 if (value < 0 || value > 255) {
3951 PyErr_SetString(PyExc_TypeError,
3952 "character mapping must be in range(256)");
3953 Py_DECREF(x);
3954 return NULL;
3955 }
3956 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 else if (PyString_Check(x))
3959 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00003962 PyErr_Format(PyExc_TypeError,
3963 "character mapping must return integer, None or str8, not %.400s",
3964 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 Py_DECREF(x);
3966 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 }
3968}
3969
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003970static int
Walter Dörwald827b0552007-05-12 13:23:53 +00003971charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003972{
Walter Dörwald827b0552007-05-12 13:23:53 +00003973 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003974 /* exponentially overallocate to minimize reallocations */
3975 if (requiredsize < 2*outsize)
3976 requiredsize = 2*outsize;
Walter Dörwald827b0552007-05-12 13:23:53 +00003977 if (PyBytes_Resize(outobj, requiredsize)) {
3978 Py_DECREF(outobj);
3979 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003980 }
Walter Dörwald827b0552007-05-12 13:23:53 +00003981 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003982}
3983
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were appended to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00003988 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003989 space is available. Return a new reference to the object that
3990 was put in the output buffer, or Py_None, if the mapping was undefined
3991 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003992 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003994charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Walter Dörwald827b0552007-05-12 13:23:53 +00003995 PyObject *outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00003997 PyObject *rep;
3998 char *outstart;
Walter Dörwald827b0552007-05-12 13:23:53 +00003999 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004001 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004002 int res = encoding_map_lookup(c, mapping);
4003 Py_ssize_t requiredsize = *outpos+1;
4004 if (res == -1)
4005 return enc_FAILED;
4006 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004007 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004008 return enc_EXCEPTION;
Walter Dörwald827b0552007-05-12 13:23:53 +00004009 outstart = PyBytes_AS_STRING(outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004010 outstart[(*outpos)++] = (char)res;
4011 return enc_SUCCESS;
4012 }
4013
4014 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004016 return enc_EXCEPTION;
4017 else if (rep==Py_None) {
4018 Py_DECREF(rep);
4019 return enc_FAILED;
4020 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004021 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004022 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004023 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004024 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004026 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004028 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004029 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4030 }
4031 else {
4032 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004033 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4034 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004035 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004036 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004038 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 }
Walter Dörwald827b0552007-05-12 13:23:53 +00004040 outstart = PyBytes_AS_STRING(outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 memcpy(outstart + *outpos, repchars, repsize);
4042 *outpos += repsize;
4043 }
4044 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004045 Py_DECREF(rep);
4046 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047}
4048
4049/* handle an error in PyUnicode_EncodeCharmap
4050 Return 0 on success, -1 on error */
4051static
4052int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004053 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004055 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004056 PyObject *res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057{
4058 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t repsize;
4060 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 Py_UNICODE *uni2;
4062 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004063 Py_ssize_t collstartpos = *inpos;
4064 Py_ssize_t collendpos = *inpos+1;
4065 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 char *encoding = "charmap";
4067 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004068 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070 /* find all unencodable characters */
4071 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004072 PyObject *rep;
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00004073 if (Py_Type(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004074 int res = encoding_map_lookup(p[collendpos], mapping);
4075 if (res != -1)
4076 break;
4077 ++collendpos;
4078 continue;
4079 }
4080
4081 rep = charmapencode_lookup(p[collendpos], mapping);
4082 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004084 else if (rep!=Py_None) {
4085 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 break;
4087 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004088 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 ++collendpos;
4090 }
4091 /* cache callback name lookup
4092 * (if not done yet, i.e. it's the first error) */
4093 if (*known_errorHandler==-1) {
4094 if ((errors==NULL) || (!strcmp(errors, "strict")))
4095 *known_errorHandler = 1;
4096 else if (!strcmp(errors, "replace"))
4097 *known_errorHandler = 2;
4098 else if (!strcmp(errors, "ignore"))
4099 *known_errorHandler = 3;
4100 else if (!strcmp(errors, "xmlcharrefreplace"))
4101 *known_errorHandler = 4;
4102 else
4103 *known_errorHandler = 0;
4104 }
4105 switch (*known_errorHandler) {
4106 case 1: /* strict */
4107 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4108 return -1;
4109 case 2: /* replace */
4110 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4111 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004112 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113 return -1;
4114 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004115 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4117 return -1;
4118 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 }
4120 /* fall through */
4121 case 3: /* ignore */
4122 *inpos = collendpos;
4123 break;
4124 case 4: /* xmlcharrefreplace */
4125 /* generate replacement (temporarily (mis)uses p) */
4126 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4127 char buffer[2+29+1+1];
4128 char *cp;
4129 sprintf(buffer, "&#%d;", (int)p[collpos]);
4130 for (cp = buffer; *cp; ++cp) {
4131 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004132 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004133 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004134 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4136 return -1;
4137 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 }
4139 }
4140 *inpos = collendpos;
4141 break;
4142 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004143 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 encoding, reason, p, size, exceptionObject,
4145 collstartpos, collendpos, &newpos);
4146 if (repunicode == NULL)
4147 return -1;
4148 /* generate replacement */
4149 repsize = PyUnicode_GET_SIZE(repunicode);
4150 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4151 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004152 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 return -1;
4154 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004155 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4158 return -1;
4159 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 }
4161 *inpos = newpos;
4162 Py_DECREF(repunicode);
4163 }
4164 return 0;
4165}
4166
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004168 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 PyObject *mapping,
4170 const char *errors)
4171{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 /* output object */
4173 PyObject *res = NULL;
4174 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004175 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004177 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 PyObject *errorHandler = NULL;
4179 PyObject *exc = NULL;
4180 /* the following variable is used for caching string comparisons
4181 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4182 * 3=ignore, 4=xmlcharrefreplace */
4183 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184
4185 /* Default to Latin-1 */
4186 if (mapping == NULL)
4187 return PyUnicode_EncodeLatin1(p, size, errors);
4188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 /* allocate enough for a simple encoding without
4190 replacements, if we need more, we'll resize */
Walter Dörwald827b0552007-05-12 13:23:53 +00004191 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 if (res == NULL)
4193 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004194 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 while (inpos<size) {
4198 /* try to encode it */
Walter Dörwald827b0552007-05-12 13:23:53 +00004199 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004200 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004202 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 if (charmap_encoding_error(p, size, &inpos, mapping,
4204 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004205 &known_errorHandler, &errorHandler, errors,
Walter Dörwald827b0552007-05-12 13:23:53 +00004206 res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00004207 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00004208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 else
4211 /* done with this character => adjust input position */
4212 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 /* Resize if we allocated to much */
Walter Dörwald827b0552007-05-12 13:23:53 +00004216 if (respos<PyBytes_GET_SIZE(res)) {
4217 if (PyBytes_Resize(res, respos))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 goto onError;
4219 }
4220 Py_XDECREF(exc);
4221 Py_XDECREF(errorHandler);
4222 return res;
4223
4224 onError:
4225 Py_XDECREF(res);
4226 Py_XDECREF(exc);
4227 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 return NULL;
4229}
4230
4231PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4232 PyObject *mapping)
4233{
4234 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4235 PyErr_BadArgument();
4236 return NULL;
4237 }
4238 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4239 PyUnicode_GET_SIZE(unicode),
4240 mapping,
4241 NULL);
4242}
4243
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244/* create or adjust a UnicodeTranslateError */
4245static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004246 const Py_UNICODE *unicode, Py_ssize_t size,
4247 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250 if (*exceptionObject == NULL) {
4251 *exceptionObject = PyUnicodeTranslateError_Create(
4252 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253 }
4254 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4256 goto onError;
4257 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4258 goto onError;
4259 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4260 goto onError;
4261 return;
4262 onError:
4263 Py_DECREF(*exceptionObject);
4264 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265 }
4266}
4267
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268/* raises a UnicodeTranslateError */
4269static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004270 const Py_UNICODE *unicode, Py_ssize_t size,
4271 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 const char *reason)
4273{
4274 make_translate_exception(exceptionObject,
4275 unicode, size, startpos, endpos, reason);
4276 if (*exceptionObject != NULL)
4277 PyCodec_StrictErrors(*exceptionObject);
4278}
4279
4280/* error handling callback helper:
4281 build arguments, call the callback and check the arguments,
4282 put the result into newpos and return the replacement string, which
4283 has to be freed by the caller */
4284static PyObject *unicode_translate_call_errorhandler(const char *errors,
4285 PyObject **errorHandler,
4286 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004287 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4288 Py_ssize_t startpos, Py_ssize_t endpos,
4289 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004291 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004293 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 PyObject *restuple;
4295 PyObject *resunicode;
4296
4297 if (*errorHandler == NULL) {
4298 *errorHandler = PyCodec_LookupError(errors);
4299 if (*errorHandler == NULL)
4300 return NULL;
4301 }
4302
4303 make_translate_exception(exceptionObject,
4304 unicode, size, startpos, endpos, reason);
4305 if (*exceptionObject == NULL)
4306 return NULL;
4307
4308 restuple = PyObject_CallFunctionObjArgs(
4309 *errorHandler, *exceptionObject, NULL);
4310 if (restuple == NULL)
4311 return NULL;
4312 if (!PyTuple_Check(restuple)) {
4313 PyErr_Format(PyExc_TypeError, &argparse[4]);
4314 Py_DECREF(restuple);
4315 return NULL;
4316 }
4317 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004318 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 Py_DECREF(restuple);
4320 return NULL;
4321 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004322 if (i_newpos<0)
4323 *newpos = size+i_newpos;
4324 else
4325 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004326 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004327 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004328 Py_DECREF(restuple);
4329 return NULL;
4330 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 Py_INCREF(resunicode);
4332 Py_DECREF(restuple);
4333 return resunicode;
4334}
4335
4336/* Lookup the character ch in the mapping and put the result in result,
4337 which must be decrefed by the caller.
4338 Return 0 on success, -1 on error */
4339static
4340int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4341{
4342 PyObject *w = PyInt_FromLong((long)c);
4343 PyObject *x;
4344
4345 if (w == NULL)
4346 return -1;
4347 x = PyObject_GetItem(mapping, w);
4348 Py_DECREF(w);
4349 if (x == NULL) {
4350 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4351 /* No mapping found means: use 1:1 mapping. */
4352 PyErr_Clear();
4353 *result = NULL;
4354 return 0;
4355 } else
4356 return -1;
4357 }
4358 else if (x == Py_None) {
4359 *result = x;
4360 return 0;
4361 }
4362 else if (PyInt_Check(x)) {
4363 long value = PyInt_AS_LONG(x);
4364 long max = PyUnicode_GetMax();
4365 if (value < 0 || value > max) {
4366 PyErr_Format(PyExc_TypeError,
4367 "character mapping must be in range(0x%lx)", max+1);
4368 Py_DECREF(x);
4369 return -1;
4370 }
4371 *result = x;
4372 return 0;
4373 }
4374 else if (PyUnicode_Check(x)) {
4375 *result = x;
4376 return 0;
4377 }
4378 else {
4379 /* wrong return value */
4380 PyErr_SetString(PyExc_TypeError,
4381 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004382 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 return -1;
4384 }
4385}
4386/* ensure that *outobj is at least requiredsize characters long,
4387if not reallocate and adjust various state variables.
4388Return 0 on success, -1 on error */
4389static
Walter Dörwald4894c302003-10-24 14:25:28 +00004390int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004391 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004393 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004394 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004396 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004398 if (requiredsize < 2 * oldsize)
4399 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004400 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 return -1;
4402 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 }
4404 return 0;
4405}
4406/* lookup the character, put the result in the output string and adjust
4407 various state variables. Return a new reference to the object that
4408 was put in the output buffer in *result, or Py_None, if the mapping was
4409 undefined (in which case no character was written).
4410 The called must decref result.
4411 Return 0 on success, -1 on error. */
4412static
Walter Dörwald4894c302003-10-24 14:25:28 +00004413int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004414 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004415 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416{
Walter Dörwald4894c302003-10-24 14:25:28 +00004417 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 return -1;
4419 if (*res==NULL) {
4420 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004421 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422 }
4423 else if (*res==Py_None)
4424 ;
4425 else if (PyInt_Check(*res)) {
4426 /* no overflow check, because we know that the space is enough */
4427 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4428 }
4429 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004430 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 if (repsize==1) {
4432 /* no overflow check, because we know that the space is enough */
4433 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4434 }
4435 else if (repsize!=0) {
4436 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004437 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004438 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004439 repsize - 1;
4440 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 return -1;
4442 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4443 *outp += repsize;
4444 }
4445 }
4446 else
4447 return -1;
4448 return 0;
4449}
4450
4451PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004452 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 PyObject *mapping,
4454 const char *errors)
4455{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 /* output object */
4457 PyObject *res = NULL;
4458 /* pointers to the beginning and end+1 of input */
4459 const Py_UNICODE *startp = p;
4460 const Py_UNICODE *endp = p + size;
4461 /* pointer into the output */
4462 Py_UNICODE *str;
4463 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004464 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 char *reason = "character maps to <undefined>";
4466 PyObject *errorHandler = NULL;
4467 PyObject *exc = NULL;
4468 /* the following variable is used for caching string comparisons
4469 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4470 * 3=ignore, 4=xmlcharrefreplace */
4471 int known_errorHandler = -1;
4472
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473 if (mapping == NULL) {
4474 PyErr_BadArgument();
4475 return NULL;
4476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477
4478 /* allocate enough for a simple 1:1 translation without
4479 replacements, if we need more, we'll resize */
4480 res = PyUnicode_FromUnicode(NULL, size);
4481 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004482 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004484 return res;
4485 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 while (p<endp) {
4488 /* try to encode it */
4489 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004490 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 goto onError;
4493 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004494 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 if (x!=Py_None) /* it worked => adjust input pointer */
4496 ++p;
4497 else { /* untranslatable character */
4498 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004499 Py_ssize_t repsize;
4500 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501 Py_UNICODE *uni2;
4502 /* startpos for collecting untranslatable chars */
4503 const Py_UNICODE *collstart = p;
4504 const Py_UNICODE *collend = p+1;
4505 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 /* find all untranslatable characters */
4508 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004509 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 goto onError;
4511 Py_XDECREF(x);
4512 if (x!=Py_None)
4513 break;
4514 ++collend;
4515 }
4516 /* cache callback name lookup
4517 * (if not done yet, i.e. it's the first error) */
4518 if (known_errorHandler==-1) {
4519 if ((errors==NULL) || (!strcmp(errors, "strict")))
4520 known_errorHandler = 1;
4521 else if (!strcmp(errors, "replace"))
4522 known_errorHandler = 2;
4523 else if (!strcmp(errors, "ignore"))
4524 known_errorHandler = 3;
4525 else if (!strcmp(errors, "xmlcharrefreplace"))
4526 known_errorHandler = 4;
4527 else
4528 known_errorHandler = 0;
4529 }
4530 switch (known_errorHandler) {
4531 case 1: /* strict */
4532 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4533 goto onError;
4534 case 2: /* replace */
4535 /* No need to check for space, this is a 1:1 replacement */
4536 for (coll = collstart; coll<collend; ++coll)
4537 *str++ = '?';
4538 /* fall through */
4539 case 3: /* ignore */
4540 p = collend;
4541 break;
4542 case 4: /* xmlcharrefreplace */
4543 /* generate replacement (temporarily (mis)uses p) */
4544 for (p = collstart; p < collend; ++p) {
4545 char buffer[2+29+1+1];
4546 char *cp;
4547 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004548 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4550 goto onError;
4551 for (cp = buffer; *cp; ++cp)
4552 *str++ = *cp;
4553 }
4554 p = collend;
4555 break;
4556 default:
4557 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4558 reason, startp, size, &exc,
4559 collstart-startp, collend-startp, &newpos);
4560 if (repunicode == NULL)
4561 goto onError;
4562 /* generate replacement */
4563 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004564 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4566 Py_DECREF(repunicode);
4567 goto onError;
4568 }
4569 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4570 *str++ = *uni2;
4571 p = startp + newpos;
4572 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573 }
4574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 /* Resize if we allocated to much */
4577 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004578 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004579 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004580 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 }
4582 Py_XDECREF(exc);
4583 Py_XDECREF(errorHandler);
4584 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586 onError:
4587 Py_XDECREF(res);
4588 Py_XDECREF(exc);
4589 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590 return NULL;
4591}
4592
4593PyObject *PyUnicode_Translate(PyObject *str,
4594 PyObject *mapping,
4595 const char *errors)
4596{
4597 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004598
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 str = PyUnicode_FromObject(str);
4600 if (str == NULL)
4601 goto onError;
4602 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4603 PyUnicode_GET_SIZE(str),
4604 mapping,
4605 errors);
4606 Py_DECREF(str);
4607 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004608
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609 onError:
4610 Py_XDECREF(str);
4611 return NULL;
4612}
Tim Petersced69f82003-09-16 20:30:58 +00004613
Guido van Rossum9e896b32000-04-05 20:11:21 +00004614/* --- Decimal Encoder ---------------------------------------------------- */
4615
4616int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004617 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004618 char *output,
4619 const char *errors)
4620{
4621 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004622 PyObject *errorHandler = NULL;
4623 PyObject *exc = NULL;
4624 const char *encoding = "decimal";
4625 const char *reason = "invalid decimal Unicode string";
4626 /* the following variable is used for caching string comparisons
4627 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4628 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004629
4630 if (output == NULL) {
4631 PyErr_BadArgument();
4632 return -1;
4633 }
4634
4635 p = s;
4636 end = s + length;
4637 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004639 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004641 Py_ssize_t repsize;
4642 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 Py_UNICODE *uni2;
4644 Py_UNICODE *collstart;
4645 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004646
Guido van Rossum9e896b32000-04-05 20:11:21 +00004647 if (Py_UNICODE_ISSPACE(ch)) {
4648 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004650 continue;
4651 }
4652 decimal = Py_UNICODE_TODECIMAL(ch);
4653 if (decimal >= 0) {
4654 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004656 continue;
4657 }
Guido van Rossumba477042000-04-06 18:18:10 +00004658 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004659 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004661 continue;
4662 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663 /* All other characters are considered unencodable */
4664 collstart = p;
4665 collend = p+1;
4666 while (collend < end) {
4667 if ((0 < *collend && *collend < 256) ||
4668 !Py_UNICODE_ISSPACE(*collend) ||
4669 Py_UNICODE_TODECIMAL(*collend))
4670 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004671 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672 /* cache callback name lookup
4673 * (if not done yet, i.e. it's the first error) */
4674 if (known_errorHandler==-1) {
4675 if ((errors==NULL) || (!strcmp(errors, "strict")))
4676 known_errorHandler = 1;
4677 else if (!strcmp(errors, "replace"))
4678 known_errorHandler = 2;
4679 else if (!strcmp(errors, "ignore"))
4680 known_errorHandler = 3;
4681 else if (!strcmp(errors, "xmlcharrefreplace"))
4682 known_errorHandler = 4;
4683 else
4684 known_errorHandler = 0;
4685 }
4686 switch (known_errorHandler) {
4687 case 1: /* strict */
4688 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4689 goto onError;
4690 case 2: /* replace */
4691 for (p = collstart; p < collend; ++p)
4692 *output++ = '?';
4693 /* fall through */
4694 case 3: /* ignore */
4695 p = collend;
4696 break;
4697 case 4: /* xmlcharrefreplace */
4698 /* generate replacement (temporarily (mis)uses p) */
4699 for (p = collstart; p < collend; ++p)
4700 output += sprintf(output, "&#%d;", (int)*p);
4701 p = collend;
4702 break;
4703 default:
4704 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4705 encoding, reason, s, length, &exc,
4706 collstart-s, collend-s, &newpos);
4707 if (repunicode == NULL)
4708 goto onError;
4709 /* generate replacement */
4710 repsize = PyUnicode_GET_SIZE(repunicode);
4711 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4712 Py_UNICODE ch = *uni2;
4713 if (Py_UNICODE_ISSPACE(ch))
4714 *output++ = ' ';
4715 else {
4716 decimal = Py_UNICODE_TODECIMAL(ch);
4717 if (decimal >= 0)
4718 *output++ = '0' + decimal;
4719 else if (0 < ch && ch < 256)
4720 *output++ = (char)ch;
4721 else {
4722 Py_DECREF(repunicode);
4723 raise_encode_exception(&exc, encoding,
4724 s, length, collstart-s, collend-s, reason);
4725 goto onError;
4726 }
4727 }
4728 }
4729 p = s + newpos;
4730 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004731 }
4732 }
4733 /* 0-terminate the output string */
4734 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 Py_XDECREF(exc);
4736 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004737 return 0;
4738
4739 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004740 Py_XDECREF(exc);
4741 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004742 return -1;
4743}
4744
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745/* --- Helpers ------------------------------------------------------------ */
4746
/* Bind the generic STRINGLIB_* names to their Py_UNICODE counterparts:
   element type, length accessor, constructor and buffer accessor.
   NOTE(review): presumably consumed by the stringlib/*.h headers that
   are #included further down in this file -- confirm. */
Thomas Wouters477c8d52006-05-27 19:21:47 +00004747#define STRINGLIB_CHAR Py_UNICODE
4748
4749#define STRINGLIB_LEN PyUnicode_GET_SIZE
4750#define STRINGLIB_NEW PyUnicode_FromUnicode
4751#define STRINGLIB_STR PyUnicode_AS_UNICODE
4752
4753Py_LOCAL_INLINE(int)
4754STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755{
Thomas Wouters477c8d52006-05-27 19:21:47 +00004756 if (str[0] != other[0])
4757 return 1;
4758 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759}
4760
Thomas Wouters477c8d52006-05-27 19:21:47 +00004761#define STRINGLIB_EMPTY unicode_empty
4762
4763#include "stringlib/fastsearch.h"
4764
4765#include "stringlib/count.h"
4766#include "stringlib/find.h"
4767#include "stringlib/partition.h"
4768
4769/* helper macro to fixup start/end slice values */
/* FIX_START_END(obj) adjusts the local variables `start` and `end` of
   the enclosing function in place, using (obj)->length:
     - a negative start has the length added and is then floored at 0
       (start is not capped at the length);
     - end is first capped at the length; a negative end then has the
       length added and is floored at 0.
   The expansion is a plain sequence of `if` statements with no
   enclosing block, so it has to be used as a statement of its own. */
4770#define FIX_START_END(obj) \
4771 if (start < 0) \
4772 start += (obj)->length; \
4773 if (start < 0) \
4774 start = 0; \
4775 if (end > (obj)->length) \
4776 end = (obj)->length; \
4777 if (end < 0) \
4778 end += (obj)->length; \
4779 if (end < 0) \
4780 end = 0;
4781
Martin v. Löwis18e16552006-02-15 17:27:45 +00004782Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004783 PyObject *substr,
4784 Py_ssize_t start,
4785 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004787 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004788 PyUnicodeObject* str_obj;
4789 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004790
Thomas Wouters477c8d52006-05-27 19:21:47 +00004791 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4792 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004794 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4795 if (!sub_obj) {
4796 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 return -1;
4798 }
Tim Petersced69f82003-09-16 20:30:58 +00004799
Thomas Wouters477c8d52006-05-27 19:21:47 +00004800 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004801
Thomas Wouters477c8d52006-05-27 19:21:47 +00004802 result = stringlib_count(
4803 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4804 );
4805
4806 Py_DECREF(sub_obj);
4807 Py_DECREF(str_obj);
4808
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 return result;
4810}
4811
Martin v. Löwis18e16552006-02-15 17:27:45 +00004812Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00004813 PyObject *sub,
4814 Py_ssize_t start,
4815 Py_ssize_t end,
4816 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004818 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004819
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004821 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004822 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00004823 sub = PyUnicode_FromObject(sub);
4824 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004825 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004826 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827 }
Tim Petersced69f82003-09-16 20:30:58 +00004828
Thomas Wouters477c8d52006-05-27 19:21:47 +00004829 if (direction > 0)
4830 result = stringlib_find_slice(
4831 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4832 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4833 start, end
4834 );
4835 else
4836 result = stringlib_rfind_slice(
4837 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4838 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4839 start, end
4840 );
4841
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00004843 Py_DECREF(sub);
4844
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 return result;
4846}
4847
Tim Petersced69f82003-09-16 20:30:58 +00004848static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849int tailmatch(PyUnicodeObject *self,
4850 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004851 Py_ssize_t start,
4852 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 int direction)
4854{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 if (substring->length == 0)
4856 return 1;
4857
Thomas Wouters477c8d52006-05-27 19:21:47 +00004858 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859
4860 end -= substring->length;
4861 if (end < start)
4862 return 0;
4863
4864 if (direction > 0) {
4865 if (Py_UNICODE_MATCH(self, end, substring))
4866 return 1;
4867 } else {
4868 if (Py_UNICODE_MATCH(self, start, substring))
4869 return 1;
4870 }
4871
4872 return 0;
4873}
4874
Martin v. Löwis18e16552006-02-15 17:27:45 +00004875Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004877 Py_ssize_t start,
4878 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 int direction)
4880{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004881 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004882
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 str = PyUnicode_FromObject(str);
4884 if (str == NULL)
4885 return -1;
4886 substr = PyUnicode_FromObject(substr);
4887 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004888 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 return -1;
4890 }
Tim Petersced69f82003-09-16 20:30:58 +00004891
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 result = tailmatch((PyUnicodeObject *)str,
4893 (PyUnicodeObject *)substr,
4894 start, end, direction);
4895 Py_DECREF(str);
4896 Py_DECREF(substr);
4897 return result;
4898}
4899
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900/* Apply fixfct filter to the Unicode object self and return a
4901 reference to the modified object */
4902
Tim Petersced69f82003-09-16 20:30:58 +00004903static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904PyObject *fixup(PyUnicodeObject *self,
4905 int (*fixfct)(PyUnicodeObject *s))
4906{
4907
4908 PyUnicodeObject *u;
4909
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004910 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 if (u == NULL)
4912 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004913
4914 Py_UNICODE_COPY(u->str, self->str, self->length);
4915
Tim Peters7a29bd52001-09-12 03:03:31 +00004916 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 /* fixfct should return TRUE if it modified the buffer. If
4918 FALSE, return a reference to the original buffer instead
4919 (to save space, not time) */
4920 Py_INCREF(self);
4921 Py_DECREF(u);
4922 return (PyObject*) self;
4923 }
4924 return (PyObject*) u;
4925}
4926
Tim Petersced69f82003-09-16 20:30:58 +00004927static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928int fixupper(PyUnicodeObject *self)
4929{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004930 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 Py_UNICODE *s = self->str;
4932 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004933
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 while (len-- > 0) {
4935 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004936
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 ch = Py_UNICODE_TOUPPER(*s);
4938 if (ch != *s) {
4939 status = 1;
4940 *s = ch;
4941 }
4942 s++;
4943 }
4944
4945 return status;
4946}
4947
Tim Petersced69f82003-09-16 20:30:58 +00004948static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949int fixlower(PyUnicodeObject *self)
4950{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004951 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 Py_UNICODE *s = self->str;
4953 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004954
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 while (len-- > 0) {
4956 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004957
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958 ch = Py_UNICODE_TOLOWER(*s);
4959 if (ch != *s) {
4960 status = 1;
4961 *s = ch;
4962 }
4963 s++;
4964 }
4965
4966 return status;
4967}
4968
Tim Petersced69f82003-09-16 20:30:58 +00004969static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970int fixswapcase(PyUnicodeObject *self)
4971{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004972 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 Py_UNICODE *s = self->str;
4974 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004975
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976 while (len-- > 0) {
4977 if (Py_UNICODE_ISUPPER(*s)) {
4978 *s = Py_UNICODE_TOLOWER(*s);
4979 status = 1;
4980 } else if (Py_UNICODE_ISLOWER(*s)) {
4981 *s = Py_UNICODE_TOUPPER(*s);
4982 status = 1;
4983 }
4984 s++;
4985 }
4986
4987 return status;
4988}
4989
Tim Petersced69f82003-09-16 20:30:58 +00004990static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991int fixcapitalize(PyUnicodeObject *self)
4992{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004993 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004994 Py_UNICODE *s = self->str;
4995 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004996
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004997 if (len == 0)
4998 return 0;
4999 if (Py_UNICODE_ISLOWER(*s)) {
5000 *s = Py_UNICODE_TOUPPER(*s);
5001 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005003 s++;
5004 while (--len > 0) {
5005 if (Py_UNICODE_ISUPPER(*s)) {
5006 *s = Py_UNICODE_TOLOWER(*s);
5007 status = 1;
5008 }
5009 s++;
5010 }
5011 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012}
5013
5014static
5015int fixtitle(PyUnicodeObject *self)
5016{
5017 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5018 register Py_UNICODE *e;
5019 int previous_is_cased;
5020
5021 /* Shortcut for single character strings */
5022 if (PyUnicode_GET_SIZE(self) == 1) {
5023 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5024 if (*p != ch) {
5025 *p = ch;
5026 return 1;
5027 }
5028 else
5029 return 0;
5030 }
Tim Petersced69f82003-09-16 20:30:58 +00005031
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032 e = p + PyUnicode_GET_SIZE(self);
5033 previous_is_cased = 0;
5034 for (; p < e; p++) {
5035 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005036
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 if (previous_is_cased)
5038 *p = Py_UNICODE_TOLOWER(ch);
5039 else
5040 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005041
5042 if (Py_UNICODE_ISLOWER(ch) ||
5043 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044 Py_UNICODE_ISTITLE(ch))
5045 previous_is_cased = 1;
5046 else
5047 previous_is_cased = 0;
5048 }
5049 return 1;
5050}
5051
Tim Peters8ce9f162004-08-27 01:49:32 +00005052PyObject *
5053PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054{
Tim Peters8ce9f162004-08-27 01:49:32 +00005055 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005056 const Py_UNICODE blank = ' ';
5057 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005058 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005059 PyUnicodeObject *res = NULL; /* the result */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005060 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5061 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005062 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5063 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005064 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005065 PyObject *item;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005066 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067
Tim Peters05eba1f2004-08-27 21:32:02 +00005068 fseq = PySequence_Fast(seq, "");
5069 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005070 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005071 }
5072
Tim Peters91879ab2004-08-27 22:35:44 +00005073 /* Grrrr. A codec may be invoked to convert str objects to
5074 * Unicode, and so it's possible to call back into Python code
5075 * during PyUnicode_FromObject(), and so it's possible for a sick
5076 * codec to change the size of fseq (if seq is a list). Therefore
5077 * we have to keep refetching the size -- can't assume seqlen
5078 * is invariant.
5079 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005080 seqlen = PySequence_Fast_GET_SIZE(fseq);
5081 /* If empty sequence, return u"". */
5082 if (seqlen == 0) {
5083 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5084 goto Done;
5085 }
5086 /* If singleton sequence with an exact Unicode, return that. */
5087 if (seqlen == 1) {
5088 item = PySequence_Fast_GET_ITEM(fseq, 0);
5089 if (PyUnicode_CheckExact(item)) {
5090 Py_INCREF(item);
5091 res = (PyUnicodeObject *)item;
5092 goto Done;
5093 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005094 }
5095
Tim Peters05eba1f2004-08-27 21:32:02 +00005096 /* At least two items to join, or one that isn't exact Unicode. */
5097 if (seqlen > 1) {
5098 /* Set up sep and seplen -- they're needed. */
5099 if (separator == NULL) {
5100 sep = &blank;
5101 seplen = 1;
5102 }
5103 else {
5104 internal_separator = PyUnicode_FromObject(separator);
5105 if (internal_separator == NULL)
5106 goto onError;
5107 sep = PyUnicode_AS_UNICODE(internal_separator);
5108 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00005109 /* In case PyUnicode_FromObject() mutated seq. */
5110 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005111 }
5112 }
5113
5114 /* Get space. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005115 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005116 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005117 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005118 res_p = PyUnicode_AS_UNICODE(res);
5119 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005120
Tim Peters05eba1f2004-08-27 21:32:02 +00005121 for (i = 0; i < seqlen; ++i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005122 Py_ssize_t itemlen;
5123 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005124
5125 item = PySequence_Fast_GET_ITEM(fseq, i);
5126 /* Convert item to Unicode. */
5127 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5128 PyErr_Format(PyExc_TypeError,
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005129 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00005130 " %.80s found",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00005131 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00005132 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005133 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005134 item = PyUnicode_FromObject(item);
5135 if (item == NULL)
5136 goto onError;
5137 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005138
Tim Peters91879ab2004-08-27 22:35:44 +00005139 /* In case PyUnicode_FromObject() mutated seq. */
5140 seqlen = PySequence_Fast_GET_SIZE(fseq);
5141
Tim Peters8ce9f162004-08-27 01:49:32 +00005142 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005144 new_res_used = res_used + itemlen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005145 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005146 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005147 if (i < seqlen - 1) {
5148 new_res_used += seplen;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005149 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00005150 goto Overflow;
5151 }
5152 if (new_res_used > res_alloc) {
5153 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00005154 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00005155 res_alloc += res_alloc;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005156 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005157 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00005158 } while (new_res_used > res_alloc);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005159 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005160 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00005162 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005163 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005165
5166 /* Copy item, and maybe the separator. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005167 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005168 res_p += itemlen;
5169 if (i < seqlen - 1) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005170 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005171 res_p += seplen;
5172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005174 res_used = new_res_used;
5175 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005176
Tim Peters05eba1f2004-08-27 21:32:02 +00005177 /* Shrink res to match the used area; this probably can't fail,
5178 * but it's cheap to check.
5179 */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005180 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00005181 goto onError;
5182
5183 Done:
5184 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005185 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 return (PyObject *)res;
5187
Tim Peters8ce9f162004-08-27 01:49:32 +00005188 Overflow:
5189 PyErr_SetString(PyExc_OverflowError,
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005190 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005191 Py_DECREF(item);
5192 /* fall through */
5193
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005195 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005196 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005197 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 return NULL;
5199}
5200
Tim Petersced69f82003-09-16 20:30:58 +00005201static
5202PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 Py_ssize_t left,
5204 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 Py_UNICODE fill)
5206{
5207 PyUnicodeObject *u;
5208
5209 if (left < 0)
5210 left = 0;
5211 if (right < 0)
5212 right = 0;
5213
Tim Peters7a29bd52001-09-12 03:03:31 +00005214 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 Py_INCREF(self);
5216 return self;
5217 }
5218
5219 u = _PyUnicode_New(left + self->length + right);
5220 if (u) {
5221 if (left)
5222 Py_UNICODE_FILL(u->str, fill, left);
5223 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5224 if (right)
5225 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5226 }
5227
5228 return u;
5229}
5230
/* SPLIT_APPEND(data, left, right): build a unicode object from
   data[left:right] and append it to the list.

   Relies on names from the enclosing function: a `PyObject *str`
   scratch variable, a `list` object, and an `onError` label that is
   jumped to when the allocation or the append fails.  The temporary
   reference held in `str` is released in both the success and the
   failure path of the append.

   The expansion is several statements without an enclosing block, so
   it must be used as a statement of its own. */
5231#define SPLIT_APPEND(data, left, right) \
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005232 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 if (!str) \
5234 goto onError; \
5235 if (PyList_Append(list, str)) { \
5236 Py_DECREF(str); \
5237 goto onError; \
5238 } \
5239 else \
5240 Py_DECREF(str);
5241
5242static
5243PyObject *split_whitespace(PyUnicodeObject *self,
5244 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005245 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005247 register Py_ssize_t i;
5248 register Py_ssize_t j;
5249 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 PyObject *str;
5251
5252 for (i = j = 0; i < len; ) {
5253 /* find a token */
5254 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5255 i++;
5256 j = i;
5257 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5258 i++;
5259 if (j < i) {
5260 if (maxcount-- <= 0)
5261 break;
5262 SPLIT_APPEND(self->str, j, i);
5263 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5264 i++;
5265 j = i;
5266 }
5267 }
5268 if (j < len) {
5269 SPLIT_APPEND(self->str, j, len);
5270 }
5271 return list;
5272
5273 onError:
5274 Py_DECREF(list);
5275 return NULL;
5276}
5277
5278PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005279 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005281 register Py_ssize_t i;
5282 register Py_ssize_t j;
5283 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 PyObject *list;
5285 PyObject *str;
5286 Py_UNICODE *data;
5287
5288 string = PyUnicode_FromObject(string);
5289 if (string == NULL)
5290 return NULL;
5291 data = PyUnicode_AS_UNICODE(string);
5292 len = PyUnicode_GET_SIZE(string);
5293
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 list = PyList_New(0);
5295 if (!list)
5296 goto onError;
5297
5298 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005299 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005302 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
5305 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005306 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 if (i < len) {
5308 if (data[i] == '\r' && i + 1 < len &&
5309 data[i+1] == '\n')
5310 i += 2;
5311 else
5312 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005313 if (keepends)
5314 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 }
Guido van Rossum86662912000-04-11 15:38:46 +00005316 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 j = i;
5318 }
5319 if (j < len) {
5320 SPLIT_APPEND(data, j, len);
5321 }
5322
5323 Py_DECREF(string);
5324 return list;
5325
5326 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005327 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 Py_DECREF(string);
5329 return NULL;
5330}
5331
Tim Petersced69f82003-09-16 20:30:58 +00005332static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333PyObject *split_char(PyUnicodeObject *self,
5334 PyObject *list,
5335 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 register Py_ssize_t i;
5339 register Py_ssize_t j;
5340 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 PyObject *str;
5342
5343 for (i = j = 0; i < len; ) {
5344 if (self->str[i] == ch) {
5345 if (maxcount-- <= 0)
5346 break;
5347 SPLIT_APPEND(self->str, j, i);
5348 i = j = i + 1;
5349 } else
5350 i++;
5351 }
5352 if (j <= len) {
5353 SPLIT_APPEND(self->str, j, len);
5354 }
5355 return list;
5356
5357 onError:
5358 Py_DECREF(list);
5359 return NULL;
5360}
5361
Tim Petersced69f82003-09-16 20:30:58 +00005362static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363PyObject *split_substring(PyUnicodeObject *self,
5364 PyObject *list,
5365 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005366 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005368 register Py_ssize_t i;
5369 register Py_ssize_t j;
5370 Py_ssize_t len = self->length;
5371 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 PyObject *str;
5373
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005374 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 if (Py_UNICODE_MATCH(self, i, substring)) {
5376 if (maxcount-- <= 0)
5377 break;
5378 SPLIT_APPEND(self->str, j, i);
5379 i = j = i + sublen;
5380 } else
5381 i++;
5382 }
5383 if (j <= len) {
5384 SPLIT_APPEND(self->str, j, len);
5385 }
5386 return list;
5387
5388 onError:
5389 Py_DECREF(list);
5390 return NULL;
5391}
5392
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005393static
5394PyObject *rsplit_whitespace(PyUnicodeObject *self,
5395 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005396 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005397{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005398 register Py_ssize_t i;
5399 register Py_ssize_t j;
5400 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005401 PyObject *str;
5402
5403 for (i = j = len - 1; i >= 0; ) {
5404 /* find a token */
5405 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5406 i--;
5407 j = i;
5408 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5409 i--;
5410 if (j > i) {
5411 if (maxcount-- <= 0)
5412 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005413 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005414 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5415 i--;
5416 j = i;
5417 }
5418 }
5419 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005420 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005421 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005422 if (PyList_Reverse(list) < 0)
5423 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005424 return list;
5425
5426 onError:
5427 Py_DECREF(list);
5428 return NULL;
5429}
5430
5431static
5432PyObject *rsplit_char(PyUnicodeObject *self,
5433 PyObject *list,
5434 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005435 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005436{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005437 register Py_ssize_t i;
5438 register Py_ssize_t j;
5439 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005440 PyObject *str;
5441
5442 for (i = j = len - 1; i >= 0; ) {
5443 if (self->str[i] == ch) {
5444 if (maxcount-- <= 0)
5445 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005446 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005447 j = i = i - 1;
5448 } else
5449 i--;
5450 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005451 if (j >= -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005452 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005453 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005454 if (PyList_Reverse(list) < 0)
5455 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005456 return list;
5457
5458 onError:
5459 Py_DECREF(list);
5460 return NULL;
5461}
5462
5463static
5464PyObject *rsplit_substring(PyUnicodeObject *self,
5465 PyObject *list,
5466 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005467 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005468{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005469 register Py_ssize_t i;
5470 register Py_ssize_t j;
5471 Py_ssize_t len = self->length;
5472 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005473 PyObject *str;
5474
5475 for (i = len - sublen, j = len; i >= 0; ) {
5476 if (Py_UNICODE_MATCH(self, i, substring)) {
5477 if (maxcount-- <= 0)
5478 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005479 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005480 j = i;
5481 i -= sublen;
5482 } else
5483 i--;
5484 }
5485 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005486 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005487 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005488 if (PyList_Reverse(list) < 0)
5489 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005490 return list;
5491
5492 onError:
5493 Py_DECREF(list);
5494 return NULL;
5495}
5496
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497#undef SPLIT_APPEND
5498
5499static
5500PyObject *split(PyUnicodeObject *self,
5501 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005502 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503{
5504 PyObject *list;
5505
5506 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005507 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508
5509 list = PyList_New(0);
5510 if (!list)
5511 return NULL;
5512
5513 if (substring == NULL)
5514 return split_whitespace(self,list,maxcount);
5515
5516 else if (substring->length == 1)
5517 return split_char(self,list,substring->str[0],maxcount);
5518
5519 else if (substring->length == 0) {
5520 Py_DECREF(list);
5521 PyErr_SetString(PyExc_ValueError, "empty separator");
5522 return NULL;
5523 }
5524 else
5525 return split_substring(self,list,substring,maxcount);
5526}
5527
Tim Petersced69f82003-09-16 20:30:58 +00005528static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005529PyObject *rsplit(PyUnicodeObject *self,
5530 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005531 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005532{
5533 PyObject *list;
5534
5535 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005536 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005537
5538 list = PyList_New(0);
5539 if (!list)
5540 return NULL;
5541
5542 if (substring == NULL)
5543 return rsplit_whitespace(self,list,maxcount);
5544
5545 else if (substring->length == 1)
5546 return rsplit_char(self,list,substring->str[0],maxcount);
5547
5548 else if (substring->length == 0) {
5549 Py_DECREF(list);
5550 PyErr_SetString(PyExc_ValueError, "empty separator");
5551 return NULL;
5552 }
5553 else
5554 return rsplit_substring(self,list,substring,maxcount);
5555}
5556
5557static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558PyObject *replace(PyUnicodeObject *self,
5559 PyUnicodeObject *str1,
5560 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562{
5563 PyUnicodeObject *u;
5564
5565 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005566 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
Thomas Wouters477c8d52006-05-27 19:21:47 +00005568 if (str1->length == str2->length) {
5569 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005570 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005571 if (str1->length == 1) {
5572 /* replace characters */
5573 Py_UNICODE u1, u2;
5574 if (!findchar(self->str, self->length, str1->str[0]))
5575 goto nothing;
5576 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5577 if (!u)
5578 return NULL;
5579 Py_UNICODE_COPY(u->str, self->str, self->length);
5580 u1 = str1->str[0];
5581 u2 = str2->str[0];
5582 for (i = 0; i < u->length; i++)
5583 if (u->str[i] == u1) {
5584 if (--maxcount < 0)
5585 break;
5586 u->str[i] = u2;
5587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005589 i = fastsearch(
5590 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00005592 if (i < 0)
5593 goto nothing;
5594 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5595 if (!u)
5596 return NULL;
5597 Py_UNICODE_COPY(u->str, self->str, self->length);
5598 while (i <= self->length - str1->length)
5599 if (Py_UNICODE_MATCH(self, i, str1)) {
5600 if (--maxcount < 0)
5601 break;
5602 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5603 i += str1->length;
5604 } else
5605 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005608
5609 Py_ssize_t n, i, j, e;
5610 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 Py_UNICODE *p;
5612
5613 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00005614 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 if (n > maxcount)
5616 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005617 if (n == 0)
5618 goto nothing;
5619 /* new_size = self->length + n * (str2->length - str1->length)); */
5620 delta = (str2->length - str1->length);
5621 if (delta == 0) {
5622 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005624 product = n * (str2->length - str1->length);
5625 if ((product / (str2->length - str1->length)) != n) {
5626 PyErr_SetString(PyExc_OverflowError,
5627 "replace string is too long");
5628 return NULL;
5629 }
5630 new_size = self->length + product;
5631 if (new_size < 0) {
5632 PyErr_SetString(PyExc_OverflowError,
5633 "replace string is too long");
5634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 }
5636 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00005637 u = _PyUnicode_New(new_size);
5638 if (!u)
5639 return NULL;
5640 i = 0;
5641 p = u->str;
5642 e = self->length - str1->length;
5643 if (str1->length > 0) {
5644 while (n-- > 0) {
5645 /* look for next match */
5646 j = i;
5647 while (j <= e) {
5648 if (Py_UNICODE_MATCH(self, j, str1))
5649 break;
5650 j++;
5651 }
5652 if (j > i) {
5653 if (j > e)
5654 break;
5655 /* copy unchanged part [i:j] */
5656 Py_UNICODE_COPY(p, self->str+i, j-i);
5657 p += j - i;
5658 }
5659 /* copy substitution string */
5660 if (str2->length > 0) {
5661 Py_UNICODE_COPY(p, str2->str, str2->length);
5662 p += str2->length;
5663 }
5664 i = j + str1->length;
5665 }
5666 if (i < self->length)
5667 /* copy tail [i:] */
5668 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5669 } else {
5670 /* interleave */
5671 while (n > 0) {
5672 Py_UNICODE_COPY(p, str2->str, str2->length);
5673 p += str2->length;
5674 if (--n <= 0)
5675 break;
5676 *p++ = self->str[i++];
5677 }
5678 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005682
5683nothing:
5684 /* nothing to replace; return original string (when possible) */
5685 if (PyUnicode_CheckExact(self)) {
5686 Py_INCREF(self);
5687 return (PyObject *) self;
5688 }
5689 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690}
5691
5692/* --- Unicode Object Methods --------------------------------------------- */
5693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005694PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695"S.title() -> unicode\n\
5696\n\
5697Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005698characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
5700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005701unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 return fixup(self, fixtitle);
5704}
5705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005706PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707"S.capitalize() -> unicode\n\
5708\n\
5709Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005710have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711
5712static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005713unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 return fixup(self, fixcapitalize);
5716}
5717
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   S.capwords(): split self on whitespace, capitalize every word through
   fixup()/fixcapitalize, and join the words again with PyUnicode_Join.
   Returns the joined string, or NULL on failure. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old item, so do it here */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5755
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005756/* Argument converter. Coerces to a single unicode character */
5757
5758static int
5759convert_uc(PyObject *obj, void *addr)
5760{
5761 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5762 PyObject *uniobj;
5763 Py_UNICODE *unistr;
5764
5765 uniobj = PyUnicode_FromObject(obj);
5766 if (uniobj == NULL) {
5767 PyErr_SetString(PyExc_TypeError,
5768 "The fill character cannot be converted to Unicode");
5769 return 0;
5770 }
5771 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5772 PyErr_SetString(PyExc_TypeError,
5773 "The fill character must be exactly one character long");
5774 Py_DECREF(uniobj);
5775 return 0;
5776 }
5777 unistr = PyUnicode_AS_UNICODE(uniobj);
5778 *fillcharloc = unistr[0];
5779 Py_DECREF(uniobj);
5780 return 1;
5781}
5782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005783PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005784"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005786Return S centered in a Unicode string of length width. Padding is\n\
5787done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788
5789static PyObject *
5790unicode_center(PyUnicodeObject *self, PyObject *args)
5791{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792 Py_ssize_t marg, left;
5793 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005794 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795
Thomas Woutersde017742006-02-16 19:34:37 +00005796 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 return NULL;
5798
Tim Peters7a29bd52001-09-12 03:03:31 +00005799 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 Py_INCREF(self);
5801 return (PyObject*) self;
5802 }
5803
5804 marg = width - self->length;
5805 left = marg / 2 + (marg & width & 1);
5806
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005807 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808}
5809
Marc-André Lemburge5034372000-08-08 08:04:29 +00005810#if 0
5811
5812/* This code should go into some future Unicode collation support
5813 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005814 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005815
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005816/* speedy UTF-16 code point order comparison */
5817/* gleaned from: */
5818/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5819
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005820static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005821{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005822 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005823 0, 0, 0, 0, 0, 0, 0, 0,
5824 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005825 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005826};
5827
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828static int
5829unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5830{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005832
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 Py_UNICODE *s1 = str1->str;
5834 Py_UNICODE *s2 = str2->str;
5835
5836 len1 = str1->length;
5837 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005838
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005840 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005841
5842 c1 = *s1++;
5843 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005844
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005845 if (c1 > (1<<11) * 26)
5846 c1 += utf16Fixup[c1>>11];
5847 if (c2 > (1<<11) * 26)
5848 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005849 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005850
5851 if (c1 != c2)
5852 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005853
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005854 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 }
5856
5857 return (len1 < len2) ? -1 : (len1 != len2);
5858}
5859
Marc-André Lemburge5034372000-08-08 08:04:29 +00005860#else
5861
5862static int
5863unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5864{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005865 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005866
5867 Py_UNICODE *s1 = str1->str;
5868 Py_UNICODE *s2 = str2->str;
5869
5870 len1 = str1->length;
5871 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005872
Marc-André Lemburge5034372000-08-08 08:04:29 +00005873 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005874 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005875
Fredrik Lundh45714e92001-06-26 16:39:36 +00005876 c1 = *s1++;
5877 c2 = *s2++;
5878
5879 if (c1 != c2)
5880 return (c1 < c2) ? -1 : 1;
5881
Marc-André Lemburge5034372000-08-08 08:04:29 +00005882 len1--; len2--;
5883 }
5884
5885 return (len1 < len2) ? -1 : (len1 != len2);
5886}
5887
5888#endif
5889
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890int PyUnicode_Compare(PyObject *left,
5891 PyObject *right)
5892{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005893 if (PyUnicode_Check(left) && PyUnicode_Check(right))
5894 return unicode_compare((PyUnicodeObject *)left,
5895 (PyUnicodeObject *)right);
5896 if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5897 (PyUnicode_Check(left) && PyString_Check(right))) {
5898 if (PyUnicode_Check(left))
5899 left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5900 if (PyUnicode_Check(right))
5901 right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5902 assert(PyString_Check(left));
5903 assert(PyString_Check(right));
5904 return PyObject_Compare(left, right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00005906 PyErr_Format(PyExc_TypeError,
5907 "Can't compare %.100s and %.100s",
5908 left->ob_type->tp_name,
5909 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 return -1;
5911}
5912
Martin v. Löwis5b222132007-06-10 09:51:05 +00005913int
5914PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5915{
5916 int i;
5917 Py_UNICODE *id;
5918 assert(PyUnicode_Check(uni));
5919 id = PyUnicode_AS_UNICODE(uni);
5920 /* Compare Unicode string and source character set string */
5921 for (i = 0; id[i] && str[i]; i++)
5922 if (id[i] != str[i])
5923 return ((int)id[i] < (int)str[i]) ? -1 : 1;
5924 if (id[i])
5925 return 1; /* uni is longer */
5926 if (str[i])
5927 return -1; /* str is longer */
5928 return 0;
5929}
5930
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00005931PyObject *PyUnicode_RichCompare(PyObject *left,
5932 PyObject *right,
5933 int op)
5934{
5935 int result;
5936
5937 result = PyUnicode_Compare(left, right);
5938 if (result == -1 && PyErr_Occurred())
5939 goto onError;
5940
5941 /* Convert the return value to a Boolean */
5942 switch (op) {
5943 case Py_EQ:
5944 result = (result == 0);
5945 break;
5946 case Py_NE:
5947 result = (result != 0);
5948 break;
5949 case Py_LE:
5950 result = (result <= 0);
5951 break;
5952 case Py_GE:
5953 result = (result >= 0);
5954 break;
5955 case Py_LT:
5956 result = (result == -1);
5957 break;
5958 case Py_GT:
5959 result = (result == 1);
5960 break;
5961 }
5962 return PyBool_FromLong(result);
5963
5964 onError:
5965
5966 /* Standard case
5967
5968 Type errors mean that PyUnicode_FromObject() could not convert
5969 one of the arguments (usually the right hand side) to Unicode,
5970 ie. we can't handle the comparison request. However, it is
5971 possible that the other object knows a comparison method, which
5972 is why we return Py_NotImplemented to give the other object a
5973 chance.
5974
5975 */
5976 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5977 PyErr_Clear();
5978 Py_INCREF(Py_NotImplemented);
5979 return Py_NotImplemented;
5980 }
5981 if (op != Py_EQ && op != Py_NE)
5982 return NULL;
5983
5984 /* Equality comparison.
5985
5986 This is a special case: we silence any PyExc_UnicodeDecodeError
5987 and instead turn it into a PyErr_UnicodeWarning.
5988
5989 */
5990 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5991 return NULL;
5992 PyErr_Clear();
5993 if (PyErr_Warn(PyExc_UnicodeWarning,
5994 (op == Py_EQ) ?
5995 "Unicode equal comparison "
5996 "failed to convert both arguments to Unicode - "
5997 "interpreting them as being unequal" :
5998 "Unicode unequal comparison "
5999 "failed to convert both arguments to Unicode - "
6000 "interpreting them as being unequal"
6001 ) < 0)
6002 return NULL;
6003 result = (op == Py_NE);
6004 return PyBool_FromLong(result);
6005}
6006
Guido van Rossum403d68b2000-03-13 15:55:09 +00006007int PyUnicode_Contains(PyObject *container,
6008 PyObject *element)
6009{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006010 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006011 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006012
6013 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006014 sub = PyUnicode_FromObject(element);
6015 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006016 PyErr_Format(PyExc_TypeError,
6017 "'in <string>' requires string as left operand, not %s",
6018 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006019 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006020 }
6021
Thomas Wouters477c8d52006-05-27 19:21:47 +00006022 str = PyUnicode_FromObject(container);
6023 if (!str) {
6024 Py_DECREF(sub);
6025 return -1;
6026 }
6027
6028 result = stringlib_contains_obj(str, sub);
6029
6030 Py_DECREF(str);
6031 Py_DECREF(sub);
6032
Guido van Rossum403d68b2000-03-13 15:55:09 +00006033 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006034}
6035
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036/* Concat to string or Unicode object giving a new Unicode object. */
6037
6038PyObject *PyUnicode_Concat(PyObject *left,
6039 PyObject *right)
6040{
6041 PyUnicodeObject *u = NULL, *v = NULL, *w;
6042
Guido van Rossum84d79dd2007-04-13 02:23:57 +00006043 if (PyBytes_Check(left) || PyBytes_Check(right))
6044 return PyBytes_Concat(left, right);
6045
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 /* Coerce the two arguments */
6047 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6048 if (u == NULL)
6049 goto onError;
6050 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6051 if (v == NULL)
6052 goto onError;
6053
6054 /* Shortcuts */
6055 if (v == unicode_empty) {
6056 Py_DECREF(v);
6057 return (PyObject *)u;
6058 }
6059 if (u == unicode_empty) {
6060 Py_DECREF(u);
6061 return (PyObject *)v;
6062 }
6063
6064 /* Concat the two Unicode strings */
6065 w = _PyUnicode_New(u->length + v->length);
6066 if (w == NULL)
6067 goto onError;
6068 Py_UNICODE_COPY(w->str, u->str, u->length);
6069 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6070
6071 Py_DECREF(u);
6072 Py_DECREF(v);
6073 return (PyObject *)w;
6074
6075onError:
6076 Py_XDECREF(u);
6077 Py_XDECREF(v);
6078 return NULL;
6079}
6080
Walter Dörwald1ab83302007-05-18 17:15:44 +00006081void
6082PyUnicode_Append(PyObject **pleft, PyObject *right)
6083{
6084 PyObject *new;
6085 if (*pleft == NULL)
6086 return;
6087 if (right == NULL || !PyUnicode_Check(*pleft)) {
6088 Py_DECREF(*pleft);
6089 *pleft = NULL;
6090 return;
6091 }
6092 new = PyUnicode_Concat(*pleft, right);
6093 Py_DECREF(*pleft);
6094 *pleft = new;
6095}
6096
6097void
6098PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6099{
6100 PyUnicode_Append(pleft, right);
6101 Py_XDECREF(right);
6102}
6103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006104PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105"S.count(sub[, start[, end]]) -> int\n\
6106\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006107Return the number of non-overlapping occurrences of substring sub in\n\
6108Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006109interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
6111static PyObject *
6112unicode_count(PyUnicodeObject *self, PyObject *args)
6113{
6114 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006115 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006116 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 PyObject *result;
6118
Guido van Rossumb8872e62000-05-09 14:14:27 +00006119 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6120 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 return NULL;
6122
6123 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006124 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 if (substring == NULL)
6126 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006127
Thomas Wouters477c8d52006-05-27 19:21:47 +00006128 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129
Thomas Wouters477c8d52006-05-27 19:21:47 +00006130 result = PyInt_FromSsize_t(
6131 stringlib_count(self->str + start, end - start,
6132 substring->str, substring->length)
6133 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
6135 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006136
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 return result;
6138}
6139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006140PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006141"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006143Encodes S using the codec registered for encoding. encoding defaults\n\
6144to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006145handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006146a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6147'xmlcharrefreplace' as well as any other name registered with\n\
6148codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
6150static PyObject *
6151unicode_encode(PyUnicodeObject *self, PyObject *args)
6152{
6153 char *encoding = NULL;
6154 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006155 PyObject *v;
6156
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6158 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006159 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006160 if (v == NULL)
6161 goto onError;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006162 if (!PyBytes_Check(v)) {
Guido van Rossum4355a472007-05-04 05:00:04 +00006163 if (PyString_Check(v)) {
6164 /* Old codec, turn it into bytes */
6165 PyObject *b = PyBytes_FromObject(v);
6166 Py_DECREF(v);
6167 return b;
6168 }
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006169 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006170 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006171 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006172 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006173 Py_DECREF(v);
6174 return NULL;
6175 }
6176 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006177
6178 onError:
6179 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006180}
6181
6182PyDoc_STRVAR(decode__doc__,
6183"S.decode([encoding[,errors]]) -> string or unicode\n\
6184\n\
6185Decodes S using the codec registered for encoding. encoding defaults\n\
6186to the default encoding. errors may be given to set a different error\n\
6187handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6188a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6189as well as any other name registerd with codecs.register_error that is\n\
6190able to handle UnicodeDecodeErrors.");
6191
6192static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006193unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006194{
6195 char *encoding = NULL;
6196 char *errors = NULL;
6197 PyObject *v;
6198
6199 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6200 return NULL;
6201 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006202 if (v == NULL)
6203 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006204 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6205 PyErr_Format(PyExc_TypeError,
6206 "decoder did not return a string/unicode object "
6207 "(type=%.400s)",
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00006208 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006209 Py_DECREF(v);
6210 return NULL;
6211 }
6212 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006213
6214 onError:
6215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216}
6217
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006218PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219"S.expandtabs([tabsize]) -> unicode\n\
6220\n\
6221Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006222If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223
6224static PyObject*
6225unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6226{
6227 Py_UNICODE *e;
6228 Py_UNICODE *p;
6229 Py_UNICODE *q;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006230 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 PyUnicodeObject *u;
6232 int tabsize = 8;
6233
6234 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6235 return NULL;
6236
Thomas Wouters7e474022000-07-16 12:04:32 +00006237 /* First pass: determine size of output string */
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006238 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 e = self->str + self->length;
6240 for (p = self->str; p < e; p++)
6241 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006242 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 j += tabsize - (j % tabsize);
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006244 if (old_j > j) {
6245 PyErr_SetString(PyExc_OverflowError,
6246 "new string is too long");
6247 return NULL;
6248 }
6249 old_j = j;
6250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 }
6252 else {
6253 j++;
6254 if (*p == '\n' || *p == '\r') {
6255 i += j;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006256 old_j = j = 0;
6257 if (i < 0) {
6258 PyErr_SetString(PyExc_OverflowError,
6259 "new string is too long");
6260 return NULL;
6261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 }
6263 }
6264
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006265 if ((i + j) < 0) {
6266 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6267 return NULL;
6268 }
6269
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 /* Second pass: create output string and fill it */
6271 u = _PyUnicode_New(i + j);
6272 if (!u)
6273 return NULL;
6274
6275 j = 0;
6276 q = u->str;
6277
6278 for (p = self->str; p < e; p++)
6279 if (*p == '\t') {
6280 if (tabsize > 0) {
6281 i = tabsize - (j % tabsize);
6282 j += i;
6283 while (i--)
6284 *q++ = ' ';
6285 }
6286 }
6287 else {
6288 j++;
6289 *q++ = *p;
6290 if (*p == '\n' || *p == '\r')
6291 j = 0;
6292 }
6293
6294 return (PyObject*) u;
6295}
6296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006297PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298"S.find(sub [,start [,end]]) -> int\n\
6299\n\
6300Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00006301such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302arguments start and end are interpreted as in slice notation.\n\
6303\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006304Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305
6306static PyObject *
6307unicode_find(PyUnicodeObject *self, PyObject *args)
6308{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006309 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006310 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006311 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006312 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313
Guido van Rossumb8872e62000-05-09 14:14:27 +00006314 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6315 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006317 substring = PyUnicode_FromObject(substring);
6318 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 return NULL;
6320
Thomas Wouters477c8d52006-05-27 19:21:47 +00006321 result = stringlib_find_slice(
6322 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6323 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6324 start, end
6325 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
6327 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006328
6329 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330}
6331
6332static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006333unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334{
6335 if (index < 0 || index >= self->length) {
6336 PyErr_SetString(PyExc_IndexError, "string index out of range");
6337 return NULL;
6338 }
6339
6340 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6341}
6342
6343static long
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006344unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006346 /* Since Unicode objects compare equal to their UTF-8 string
6347 counterparts, we hash the UTF-8 string. */
6348 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6349 return PyObject_Hash(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350}
6351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006352PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353"S.index(sub [,start [,end]]) -> int\n\
6354\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006355Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356
6357static PyObject *
6358unicode_index(PyUnicodeObject *self, PyObject *args)
6359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006360 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006361 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006362 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006363 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364
Guido van Rossumb8872e62000-05-09 14:14:27 +00006365 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6366 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006368 substring = PyUnicode_FromObject(substring);
6369 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 return NULL;
6371
Thomas Wouters477c8d52006-05-27 19:21:47 +00006372 result = stringlib_find_slice(
6373 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6374 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6375 start, end
6376 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377
6378 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006379
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 if (result < 0) {
6381 PyErr_SetString(PyExc_ValueError, "substring not found");
6382 return NULL;
6383 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006384
Martin v. Löwis18e16552006-02-15 17:27:45 +00006385 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386}
6387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006388PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006389"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006391Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006392at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393
6394static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006395unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396{
6397 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6398 register const Py_UNICODE *e;
6399 int cased;
6400
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 /* Shortcut for single character strings */
6402 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006403 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006405 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006406 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006407 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006408
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 e = p + PyUnicode_GET_SIZE(self);
6410 cased = 0;
6411 for (; p < e; p++) {
6412 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006413
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006415 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 else if (!cased && Py_UNICODE_ISLOWER(ch))
6417 cased = 1;
6418 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006419 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420}
6421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006422PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006423"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006425Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006426at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427
6428static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006429unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430{
6431 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6432 register const Py_UNICODE *e;
6433 int cased;
6434
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 /* Shortcut for single character strings */
6436 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006437 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006439 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006440 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006441 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006442
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 e = p + PyUnicode_GET_SIZE(self);
6444 cased = 0;
6445 for (; p < e; p++) {
6446 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006447
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006449 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 else if (!cased && Py_UNICODE_ISUPPER(ch))
6451 cased = 1;
6452 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006453 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454}
6455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006456PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006457"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006459Return True if S is a titlecased string and there is at least one\n\
6460character in S, i.e. upper- and titlecase characters may only\n\
6461follow uncased characters and lowercase characters only cased ones.\n\
6462Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463
6464static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006465unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466{
6467 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6468 register const Py_UNICODE *e;
6469 int cased, previous_is_cased;
6470
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 /* Shortcut for single character strings */
6472 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006473 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6474 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006476 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006477 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006478 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006479
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 e = p + PyUnicode_GET_SIZE(self);
6481 cased = 0;
6482 previous_is_cased = 0;
6483 for (; p < e; p++) {
6484 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006485
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6487 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006488 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 previous_is_cased = 1;
6490 cased = 1;
6491 }
6492 else if (Py_UNICODE_ISLOWER(ch)) {
6493 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006494 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 previous_is_cased = 1;
6496 cased = 1;
6497 }
6498 else
6499 previous_is_cased = 0;
6500 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006501 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502}
6503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006504PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006505"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006507Return True if all characters in S are whitespace\n\
6508and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509
6510static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006511unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512{
6513 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6514 register const Py_UNICODE *e;
6515
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 /* Shortcut for single character strings */
6517 if (PyUnicode_GET_SIZE(self) == 1 &&
6518 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006519 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006521 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006522 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006523 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006524
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 e = p + PyUnicode_GET_SIZE(self);
6526 for (; p < e; p++) {
6527 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006528 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006530 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531}
6532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006533PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006534"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006535\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006536Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006537and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006538
6539static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006540unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006541{
6542 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6543 register const Py_UNICODE *e;
6544
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006545 /* Shortcut for single character strings */
6546 if (PyUnicode_GET_SIZE(self) == 1 &&
6547 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006548 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006549
6550 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006551 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006552 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006553
6554 e = p + PyUnicode_GET_SIZE(self);
6555 for (; p < e; p++) {
6556 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006557 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006558 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006559 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006560}
6561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006562PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006563"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006564\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006565Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006566and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006567
6568static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006569unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006570{
6571 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6572 register const Py_UNICODE *e;
6573
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006574 /* Shortcut for single character strings */
6575 if (PyUnicode_GET_SIZE(self) == 1 &&
6576 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006577 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006578
6579 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006580 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006581 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006582
6583 e = p + PyUnicode_GET_SIZE(self);
6584 for (; p < e; p++) {
6585 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006586 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006587 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006588 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006589}
6590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006591PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006592"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006594Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006595False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
6597static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006598unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599{
6600 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6601 register const Py_UNICODE *e;
6602
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 /* Shortcut for single character strings */
6604 if (PyUnicode_GET_SIZE(self) == 1 &&
6605 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006606 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006608 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006609 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006610 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006611
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 e = p + PyUnicode_GET_SIZE(self);
6613 for (; p < e; p++) {
6614 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006615 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006617 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618}
6619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006620PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006621"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006623Return True if all characters in S are digits\n\
6624and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625
6626static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006627unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628{
6629 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6630 register const Py_UNICODE *e;
6631
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 /* Shortcut for single character strings */
6633 if (PyUnicode_GET_SIZE(self) == 1 &&
6634 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006635 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006637 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006638 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006639 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006640
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 e = p + PyUnicode_GET_SIZE(self);
6642 for (; p < e; p++) {
6643 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006644 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006646 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006649PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006650"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006652Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006653False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654
6655static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006656unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657{
6658 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6659 register const Py_UNICODE *e;
6660
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 /* Shortcut for single character strings */
6662 if (PyUnicode_GET_SIZE(self) == 1 &&
6663 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006664 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006666 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006667 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006668 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006669
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 e = p + PyUnicode_GET_SIZE(self);
6671 for (; p < e; p++) {
6672 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006673 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006675 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676}
6677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006678PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679"S.join(sequence) -> unicode\n\
6680\n\
6681Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006682sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683
6684static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006685unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006687 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688}
6689
Martin v. Löwis18e16552006-02-15 17:27:45 +00006690static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691unicode_length(PyUnicodeObject *self)
6692{
6693 return self->length;
6694}
6695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006696PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006697"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698\n\
6699Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006700done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701
6702static PyObject *
6703unicode_ljust(PyUnicodeObject *self, PyObject *args)
6704{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006705 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006706 Py_UNICODE fillchar = ' ';
6707
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006708 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 return NULL;
6710
Tim Peters7a29bd52001-09-12 03:03:31 +00006711 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 Py_INCREF(self);
6713 return (PyObject*) self;
6714 }
6715
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006716 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006719PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720"S.lower() -> unicode\n\
6721\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006722Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723
6724static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006725unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 return fixup(self, fixlower);
6728}
6729
/* Selectors for the strip helpers: which end(s) of the string to trim.
   The values double as indices into stripformat[] below, so the two must
   stay in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per selector: a single optional
   object argument followed by the method name used in error messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: skips the three-character "|O:" prefix of
   the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
6738
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006739/* externally visible for str.strip(unicode) */
6740PyObject *
6741_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6742{
6743 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006744 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006745 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006746 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6747 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006748
Thomas Wouters477c8d52006-05-27 19:21:47 +00006749 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6750
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006751 i = 0;
6752 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006753 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6754 i++;
6755 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006756 }
6757
6758 j = len;
6759 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006760 do {
6761 j--;
6762 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6763 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006764 }
6765
6766 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006767 Py_INCREF(self);
6768 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006769 }
6770 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00006771 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006772}
6773
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774
6775static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006776do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006778 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006779 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006780
6781 i = 0;
6782 if (striptype != RIGHTSTRIP) {
6783 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6784 i++;
6785 }
6786 }
6787
6788 j = len;
6789 if (striptype != LEFTSTRIP) {
6790 do {
6791 j--;
6792 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6793 j++;
6794 }
6795
6796 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6797 Py_INCREF(self);
6798 return (PyObject*)self;
6799 }
6800 else
6801 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802}
6803
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006804
6805static PyObject *
6806do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6807{
6808 PyObject *sep = NULL;
6809
6810 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6811 return NULL;
6812
6813 if (sep != NULL && sep != Py_None) {
6814 if (PyUnicode_Check(sep))
6815 return _PyUnicode_XStrip(self, striptype, sep);
6816 else if (PyString_Check(sep)) {
6817 PyObject *res;
6818 sep = PyUnicode_FromObject(sep);
6819 if (sep==NULL)
6820 return NULL;
6821 res = _PyUnicode_XStrip(self, striptype, sep);
6822 Py_DECREF(sep);
6823 return res;
6824 }
6825 else {
6826 PyErr_Format(PyExc_TypeError,
6827 "%s arg must be None, unicode or str",
6828 STRIPNAME(striptype));
6829 return NULL;
6830 }
6831 }
6832
6833 return do_strip(self, striptype);
6834}
6835
6836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006837PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006838"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006839\n\
6840Return a copy of the string S with leading and trailing\n\
6841whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006842If chars is given and not None, remove characters in chars instead.\n\
6843If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006844
6845static PyObject *
6846unicode_strip(PyUnicodeObject *self, PyObject *args)
6847{
6848 if (PyTuple_GET_SIZE(args) == 0)
6849 return do_strip(self, BOTHSTRIP); /* Common case */
6850 else
6851 return do_argstrip(self, BOTHSTRIP, args);
6852}
6853
6854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006855PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006856"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006857\n\
6858Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006859If chars is given and not None, remove characters in chars instead.\n\
6860If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006861
6862static PyObject *
6863unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6864{
6865 if (PyTuple_GET_SIZE(args) == 0)
6866 return do_strip(self, LEFTSTRIP); /* Common case */
6867 else
6868 return do_argstrip(self, LEFTSTRIP, args);
6869}
6870
6871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006872PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006873"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006874\n\
6875Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006876If chars is given and not None, remove characters in chars instead.\n\
6877If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006878
6879static PyObject *
6880unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6881{
6882 if (PyTuple_GET_SIZE(args) == 0)
6883 return do_strip(self, RIGHTSTRIP); /* Common case */
6884 else
6885 return do_argstrip(self, RIGHTSTRIP, args);
6886}
6887
6888
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006890unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891{
6892 PyUnicodeObject *u;
6893 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006894 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006895 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
6897 if (len < 0)
6898 len = 0;
6899
Tim Peters7a29bd52001-09-12 03:03:31 +00006900 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 /* no repeat, return original string */
6902 Py_INCREF(str);
6903 return (PyObject*) str;
6904 }
Tim Peters8f422462000-09-09 06:13:41 +00006905
6906 /* ensure # of chars needed doesn't overflow int and # of bytes
6907 * needed doesn't overflow size_t
6908 */
6909 nchars = len * str->length;
6910 if (len && nchars / len != str->length) {
6911 PyErr_SetString(PyExc_OverflowError,
6912 "repeated string is too long");
6913 return NULL;
6914 }
6915 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6916 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6917 PyErr_SetString(PyExc_OverflowError,
6918 "repeated string is too long");
6919 return NULL;
6920 }
6921 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 if (!u)
6923 return NULL;
6924
6925 p = u->str;
6926
Thomas Wouters477c8d52006-05-27 19:21:47 +00006927 if (str->length == 1 && len > 0) {
6928 Py_UNICODE_FILL(p, str->str[0], len);
6929 } else {
6930 Py_ssize_t done = 0; /* number of characters copied this far */
6931 if (done < nchars) {
6932 Py_UNICODE_COPY(p, str->str, str->length);
6933 done = str->length;
6934 }
6935 while (done < nchars) {
6936 int n = (done <= nchars-done) ? done : nchars-done;
6937 Py_UNICODE_COPY(p+done, p, n);
6938 done += n;
6939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 }
6941
6942 return (PyObject*) u;
6943}
6944
6945PyObject *PyUnicode_Replace(PyObject *obj,
6946 PyObject *subobj,
6947 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006948 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949{
6950 PyObject *self;
6951 PyObject *str1;
6952 PyObject *str2;
6953 PyObject *result;
6954
6955 self = PyUnicode_FromObject(obj);
6956 if (self == NULL)
6957 return NULL;
6958 str1 = PyUnicode_FromObject(subobj);
6959 if (str1 == NULL) {
6960 Py_DECREF(self);
6961 return NULL;
6962 }
6963 str2 = PyUnicode_FromObject(replobj);
6964 if (str2 == NULL) {
6965 Py_DECREF(self);
6966 Py_DECREF(str1);
6967 return NULL;
6968 }
Tim Petersced69f82003-09-16 20:30:58 +00006969 result = replace((PyUnicodeObject *)self,
6970 (PyUnicodeObject *)str1,
6971 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 maxcount);
6973 Py_DECREF(self);
6974 Py_DECREF(str1);
6975 Py_DECREF(str2);
6976 return result;
6977}
6978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006979PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980"S.replace (old, new[, maxsplit]) -> unicode\n\
6981\n\
6982Return a copy of S with all occurrences of substring\n\
6983old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006984given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985
6986static PyObject*
6987unicode_replace(PyUnicodeObject *self, PyObject *args)
6988{
6989 PyUnicodeObject *str1;
6990 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006991 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 PyObject *result;
6993
Martin v. Löwis18e16552006-02-15 17:27:45 +00006994 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 return NULL;
6996 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6997 if (str1 == NULL)
6998 return NULL;
6999 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007000 if (str2 == NULL) {
7001 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004
7005 result = replace(self, str1, str2, maxcount);
7006
7007 Py_DECREF(str1);
7008 Py_DECREF(str2);
7009 return result;
7010}
7011
7012static
7013PyObject *unicode_repr(PyObject *unicode)
7014{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007015 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007016 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007017 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7018 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7019
7020 /* XXX(nnorwitz): rather than over-allocating, it would be
7021 better to choose a different scheme. Perhaps scan the
7022 first N-chars of the string and allocate based on that size.
7023 */
7024 /* Initial allocation is based on the longest-possible unichr
7025 escape.
7026
7027 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7028 unichr, so in this case it's the longest unichr escape. In
7029 narrow (UTF-16) builds this is five chars per source unichr
7030 since there are two unichrs in the surrogate pair, so in narrow
7031 (UTF-16) builds it's not the longest unichr escape.
7032
7033 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7034 so in the narrow (UTF-16) build case it's the longest unichr
7035 escape.
7036 */
7037
Walter Dörwald1ab83302007-05-18 17:15:44 +00007038 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007039 2 /* quotes */
7040#ifdef Py_UNICODE_WIDE
7041 + 10*size
7042#else
7043 + 6*size
7044#endif
7045 + 1);
7046 if (repr == NULL)
7047 return NULL;
7048
Walter Dörwald1ab83302007-05-18 17:15:44 +00007049 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007050
7051 /* Add quote */
7052 *p++ = (findchar(s, size, '\'') &&
7053 !findchar(s, size, '"')) ? '"' : '\'';
7054 while (size-- > 0) {
7055 Py_UNICODE ch = *s++;
7056
7057 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007058 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007059 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007060 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007061 continue;
7062 }
7063
7064#ifdef Py_UNICODE_WIDE
7065 /* Map 21-bit characters to '\U00xxxxxx' */
7066 else if (ch >= 0x10000) {
7067 *p++ = '\\';
7068 *p++ = 'U';
7069 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7070 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7071 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7072 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7073 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7074 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7075 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7076 *p++ = hexdigits[ch & 0x0000000F];
7077 continue;
7078 }
7079#else
7080 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7081 else if (ch >= 0xD800 && ch < 0xDC00) {
7082 Py_UNICODE ch2;
7083 Py_UCS4 ucs;
7084
7085 ch2 = *s++;
7086 size--;
7087 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7088 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7089 *p++ = '\\';
7090 *p++ = 'U';
7091 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7092 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7093 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7094 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7095 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7096 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7097 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7098 *p++ = hexdigits[ucs & 0x0000000F];
7099 continue;
7100 }
7101 /* Fall through: isolated surrogates are copied as-is */
7102 s--;
7103 size++;
7104 }
7105#endif
7106
7107 /* Map 16-bit characters to '\uxxxx' */
7108 if (ch >= 256) {
7109 *p++ = '\\';
7110 *p++ = 'u';
7111 *p++ = hexdigits[(ch >> 12) & 0x000F];
7112 *p++ = hexdigits[(ch >> 8) & 0x000F];
7113 *p++ = hexdigits[(ch >> 4) & 0x000F];
7114 *p++ = hexdigits[ch & 0x000F];
7115 }
7116
7117 /* Map special whitespace to '\t', \n', '\r' */
7118 else if (ch == '\t') {
7119 *p++ = '\\';
7120 *p++ = 't';
7121 }
7122 else if (ch == '\n') {
7123 *p++ = '\\';
7124 *p++ = 'n';
7125 }
7126 else if (ch == '\r') {
7127 *p++ = '\\';
7128 *p++ = 'r';
7129 }
7130
7131 /* Map non-printable US ASCII to '\xhh' */
7132 else if (ch < ' ' || ch >= 0x7F) {
7133 *p++ = '\\';
7134 *p++ = 'x';
7135 *p++ = hexdigits[(ch >> 4) & 0x000F];
7136 *p++ = hexdigits[ch & 0x000F];
7137 }
7138
7139 /* Copy everything else as-is */
7140 else
7141 *p++ = (char) ch;
7142 }
7143 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007144 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007145
7146 *p = '\0';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007147 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007148 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149}
7150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007151PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152"S.rfind(sub [,start [,end]]) -> int\n\
7153\n\
7154Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007155such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156arguments start and end are interpreted as in slice notation.\n\
7157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007158Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159
7160static PyObject *
7161unicode_rfind(PyUnicodeObject *self, PyObject *args)
7162{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007163 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007164 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007165 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007166 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167
Guido van Rossumb8872e62000-05-09 14:14:27 +00007168 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7169 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007171 substring = PyUnicode_FromObject(substring);
7172 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 return NULL;
7174
Thomas Wouters477c8d52006-05-27 19:21:47 +00007175 result = stringlib_rfind_slice(
7176 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7177 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7178 start, end
7179 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180
7181 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007182
7183 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184}
7185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007186PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187"S.rindex(sub [,start [,end]]) -> int\n\
7188\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007189Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190
7191static PyObject *
7192unicode_rindex(PyUnicodeObject *self, PyObject *args)
7193{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007194 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007195 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007196 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007197 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198
Guido van Rossumb8872e62000-05-09 14:14:27 +00007199 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7200 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007202 substring = PyUnicode_FromObject(substring);
7203 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 return NULL;
7205
Thomas Wouters477c8d52006-05-27 19:21:47 +00007206 result = stringlib_rfind_slice(
7207 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7208 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7209 start, end
7210 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211
7212 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007213
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 if (result < 0) {
7215 PyErr_SetString(PyExc_ValueError, "substring not found");
7216 return NULL;
7217 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007218 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219}
7220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007221PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007222"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223\n\
7224Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007225done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226
7227static PyObject *
7228unicode_rjust(PyUnicodeObject *self, PyObject *args)
7229{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007230 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007231 Py_UNICODE fillchar = ' ';
7232
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007233 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234 return NULL;
7235
Tim Peters7a29bd52001-09-12 03:03:31 +00007236 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 Py_INCREF(self);
7238 return (PyObject*) self;
7239 }
7240
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007241 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242}
7243
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007245unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246{
7247 /* standard clamping */
7248 if (start < 0)
7249 start = 0;
7250 if (end < 0)
7251 end = 0;
7252 if (end > self->length)
7253 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007254 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 /* full slice, return original string */
7256 Py_INCREF(self);
7257 return (PyObject*) self;
7258 }
7259 if (start > end)
7260 start = end;
7261 /* copy slice */
7262 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7263 end - start);
7264}
7265
7266PyObject *PyUnicode_Split(PyObject *s,
7267 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007268 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269{
7270 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007271
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 s = PyUnicode_FromObject(s);
7273 if (s == NULL)
7274 return NULL;
7275 if (sep != NULL) {
7276 sep = PyUnicode_FromObject(sep);
7277 if (sep == NULL) {
7278 Py_DECREF(s);
7279 return NULL;
7280 }
7281 }
7282
7283 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7284
7285 Py_DECREF(s);
7286 Py_XDECREF(sep);
7287 return result;
7288}
7289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007290PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291"S.split([sep [,maxsplit]]) -> list of strings\n\
7292\n\
7293Return a list of the words in S, using sep as the\n\
7294delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00007295splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00007296any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298static PyObject*
7299unicode_split(PyUnicodeObject *self, PyObject *args)
7300{
7301 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007302 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303
Martin v. Löwis18e16552006-02-15 17:27:45 +00007304 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 return NULL;
7306
7307 if (substring == Py_None)
7308 return split(self, NULL, maxcount);
7309 else if (PyUnicode_Check(substring))
7310 return split(self, (PyUnicodeObject *)substring, maxcount);
7311 else
7312 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7313}
7314
Thomas Wouters477c8d52006-05-27 19:21:47 +00007315PyObject *
7316PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7317{
7318 PyObject* str_obj;
7319 PyObject* sep_obj;
7320 PyObject* out;
7321
7322 str_obj = PyUnicode_FromObject(str_in);
7323 if (!str_obj)
7324 return NULL;
7325 sep_obj = PyUnicode_FromObject(sep_in);
7326 if (!sep_obj) {
7327 Py_DECREF(str_obj);
7328 return NULL;
7329 }
7330
7331 out = stringlib_partition(
7332 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7333 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7334 );
7335
7336 Py_DECREF(sep_obj);
7337 Py_DECREF(str_obj);
7338
7339 return out;
7340}
7341
7342
7343PyObject *
7344PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7345{
7346 PyObject* str_obj;
7347 PyObject* sep_obj;
7348 PyObject* out;
7349
7350 str_obj = PyUnicode_FromObject(str_in);
7351 if (!str_obj)
7352 return NULL;
7353 sep_obj = PyUnicode_FromObject(sep_in);
7354 if (!sep_obj) {
7355 Py_DECREF(str_obj);
7356 return NULL;
7357 }
7358
7359 out = stringlib_rpartition(
7360 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7361 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7362 );
7363
7364 Py_DECREF(sep_obj);
7365 Py_DECREF(str_obj);
7366
7367 return out;
7368}
7369
7370PyDoc_STRVAR(partition__doc__,
7371"S.partition(sep) -> (head, sep, tail)\n\
7372\n\
7373Searches for the separator sep in S, and returns the part before it,\n\
7374the separator itself, and the part after it. If the separator is not\n\
7375found, returns S and two empty strings.");
7376
7377static PyObject*
7378unicode_partition(PyUnicodeObject *self, PyObject *separator)
7379{
7380 return PyUnicode_Partition((PyObject *)self, separator);
7381}
7382
7383PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00007384"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007385\n\
7386Searches for the separator sep in S, starting at the end of S, and returns\n\
7387the part before it, the separator itself, and the part after it. If the\n\
Thomas Wouters89f507f2006-12-13 04:49:30 +00007388separator is not found, returns two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00007389
7390static PyObject*
7391unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7392{
7393 return PyUnicode_RPartition((PyObject *)self, separator);
7394}
7395
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007396PyObject *PyUnicode_RSplit(PyObject *s,
7397 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007398 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007399{
7400 PyObject *result;
7401
7402 s = PyUnicode_FromObject(s);
7403 if (s == NULL)
7404 return NULL;
7405 if (sep != NULL) {
7406 sep = PyUnicode_FromObject(sep);
7407 if (sep == NULL) {
7408 Py_DECREF(s);
7409 return NULL;
7410 }
7411 }
7412
7413 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7414
7415 Py_DECREF(s);
7416 Py_XDECREF(sep);
7417 return result;
7418}
7419
7420PyDoc_STRVAR(rsplit__doc__,
7421"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7422\n\
7423Return a list of the words in S, using sep as the\n\
7424delimiter string, starting at the end of the string and\n\
7425working to the front. If maxsplit is given, at most maxsplit\n\
7426splits are done. If sep is not specified, any whitespace string\n\
7427is a separator.");
7428
7429static PyObject*
7430unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7431{
7432 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007433 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007434
Martin v. Löwis18e16552006-02-15 17:27:45 +00007435 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007436 return NULL;
7437
7438 if (substring == Py_None)
7439 return rsplit(self, NULL, maxcount);
7440 else if (PyUnicode_Check(substring))
7441 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7442 else
7443 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7444}
7445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007446PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007447"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448\n\
7449Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007450Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007451is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452
7453static PyObject*
7454unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7455{
Guido van Rossum86662912000-04-11 15:38:46 +00007456 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457
Guido van Rossum86662912000-04-11 15:38:46 +00007458 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 return NULL;
7460
Guido van Rossum86662912000-04-11 15:38:46 +00007461 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462}
7463
7464static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007465PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466{
Walter Dörwald346737f2007-05-31 10:44:43 +00007467 if (PyUnicode_CheckExact(self)) {
7468 Py_INCREF(self);
7469 return self;
7470 } else
7471 /* Subtype -- return genuine unicode string with the same value. */
7472 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7473 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474}
7475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007476PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477"S.swapcase() -> unicode\n\
7478\n\
7479Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007480and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481
7482static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007483unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 return fixup(self, fixswapcase);
7486}
7487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007488PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489"S.translate(table) -> unicode\n\
7490\n\
7491Return a copy of the string S, where all characters have been mapped\n\
7492through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007493Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7494Unmapped characters are left untouched. Characters mapped to None\n\
7495are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496
7497static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007498unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499{
Tim Petersced69f82003-09-16 20:30:58 +00007500 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007502 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 "ignore");
7504}
7505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007506PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507"S.upper() -> unicode\n\
7508\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007509Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510
7511static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007512unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 return fixup(self, fixupper);
7515}
7516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007517PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518"S.zfill(width) -> unicode\n\
7519\n\
7520Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007521of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522
7523static PyObject *
7524unicode_zfill(PyUnicodeObject *self, PyObject *args)
7525{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007526 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 PyUnicodeObject *u;
7528
Martin v. Löwis18e16552006-02-15 17:27:45 +00007529 Py_ssize_t width;
7530 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 return NULL;
7532
7533 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007534 if (PyUnicode_CheckExact(self)) {
7535 Py_INCREF(self);
7536 return (PyObject*) self;
7537 }
7538 else
7539 return PyUnicode_FromUnicode(
7540 PyUnicode_AS_UNICODE(self),
7541 PyUnicode_GET_SIZE(self)
7542 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 }
7544
7545 fill = width - self->length;
7546
7547 u = pad(self, fill, 0, '0');
7548
Walter Dörwald068325e2002-04-15 13:36:47 +00007549 if (u == NULL)
7550 return NULL;
7551
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 if (u->str[fill] == '+' || u->str[fill] == '-') {
7553 /* move sign to beginning of string */
7554 u->str[0] = u->str[fill];
7555 u->str[fill] = '0';
7556 }
7557
7558 return (PyObject*) u;
7559}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560
#if 0
/* Compiled out by the surrounding #if 0.  Returns the value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007569PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007570"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007572Return True if S starts with the specified prefix, False otherwise.\n\
7573With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007574With optional end, stop comparing S at that position.\n\
7575prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
7577static PyObject *
7578unicode_startswith(PyUnicodeObject *self,
7579 PyObject *args)
7580{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007581 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007583 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007584 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007585 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007587 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007588 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007590 if (PyTuple_Check(subobj)) {
7591 Py_ssize_t i;
7592 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7593 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7594 PyTuple_GET_ITEM(subobj, i));
7595 if (substring == NULL)
7596 return NULL;
7597 result = tailmatch(self, substring, start, end, -1);
7598 Py_DECREF(substring);
7599 if (result) {
7600 Py_RETURN_TRUE;
7601 }
7602 }
7603 /* nothing matched */
7604 Py_RETURN_FALSE;
7605 }
7606 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007608 return NULL;
7609 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007611 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612}
7613
7614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007615PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007616"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007618Return True if S ends with the specified suffix, False otherwise.\n\
7619With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007620With optional end, stop comparing S at that position.\n\
7621suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622
7623static PyObject *
7624unicode_endswith(PyUnicodeObject *self,
7625 PyObject *args)
7626{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007627 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007629 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007630 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007631 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007633 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7634 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007636 if (PyTuple_Check(subobj)) {
7637 Py_ssize_t i;
7638 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7639 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7640 PyTuple_GET_ITEM(subobj, i));
7641 if (substring == NULL)
7642 return NULL;
7643 result = tailmatch(self, substring, start, end, +1);
7644 Py_DECREF(substring);
7645 if (result) {
7646 Py_RETURN_TRUE;
7647 }
7648 }
7649 Py_RETURN_FALSE;
7650 }
7651 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007655 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007657 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658}
7659
7660
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007661
7662static PyObject *
7663unicode_getnewargs(PyUnicodeObject *v)
7664{
7665 return Py_BuildValue("(u#)", v->str, v->length);
7666}
7667
7668
/* Method table for the unicode type.  Each entry gives the method name,
   the C function implementing it, its calling-convention flag
   (METH_VARARGS, METH_O or METH_NOARGS) and, for most entries, its
   docstring.  The table ends with a {NULL, NULL} sentinel.  Entries
   wrapped in "#if 0" or commented out are not compiled in. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669static PyMethodDef unicode_methods[] = {
7670
7671 /* Order is according to common usage: often used methods should
7672 appear first, since lookup is done sequentially. */
7673
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007674 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7675 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7676 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007677 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007678 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7679 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7680 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7681 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7682 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7683 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7684 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007685 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007686 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7687 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7688 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007689 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007690 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007691/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7692 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7693 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7694 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007695 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00007696 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007697 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007698 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007699 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7700 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7701 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7702 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7703 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7704 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7705 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7706 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7707 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7708 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7709 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7710 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7711 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7712 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007713 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007714#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007715 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716#endif
7717
7718#if 0
7719 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007720 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721#endif
7722
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007723 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 {NULL, NULL}
7725};
7726
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007727static PyObject *
7728unicode_mod(PyObject *v, PyObject *w)
7729{
7730 if (!PyUnicode_Check(v)) {
7731 Py_INCREF(Py_NotImplemented);
7732 return Py_NotImplemented;
7733 }
7734 return PyUnicode_Format(v, w);
7735}
7736
/* Number-protocol slots for unicode objects.  Only nb_remainder is
   filled in: it points at unicode_mod, which delegates to
   PyUnicode_Format().  The slots listed before it are left empty and the
   initializer stops after nb_remainder. */
7737static PyNumberMethods unicode_as_number = {
7738 0, /*nb_add*/
7739 0, /*nb_subtract*/
7740 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007741 unicode_mod, /*nb_remainder*/
7742};
7743
/* Sequence-protocol slots for unicode objects.  Length, concatenation,
   repetition, item access, slicing and containment are provided; the two
   assignment slots (sq_ass_item, sq_ass_slice) are left empty. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007745 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007746 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007747 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7748 (ssizeargfunc) unicode_getitem, /* sq_item */
7749 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 0, /* sq_ass_item */
7751 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007752 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753};
7754
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007755static PyObject*
7756unicode_subscript(PyUnicodeObject* self, PyObject* item)
7757{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007758 if (PyIndex_Check(item)) {
7759 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007760 if (i == -1 && PyErr_Occurred())
7761 return NULL;
7762 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007763 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007764 return unicode_getitem(self, i);
7765 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007766 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007767 Py_UNICODE* source_buf;
7768 Py_UNICODE* result_buf;
7769 PyObject* result;
7770
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007771 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007772 &start, &stop, &step, &slicelength) < 0) {
7773 return NULL;
7774 }
7775
7776 if (slicelength <= 0) {
7777 return PyUnicode_FromUnicode(NULL, 0);
7778 } else {
7779 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007780 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7781 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007782
7783 if (result_buf == NULL)
7784 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007785
7786 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7787 result_buf[i] = source_buf[cur];
7788 }
Tim Petersced69f82003-09-16 20:30:58 +00007789
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007790 result = PyUnicode_FromUnicode(result_buf, slicelength);
7791 PyMem_FREE(result_buf);
7792 return result;
7793 }
7794 } else {
7795 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7796 return NULL;
7797 }
7798}
7799
/* Mapping-protocol slots for unicode objects: length and subscripting
   (unicode_subscript handles both indices and slices).  The
   subscript-assignment slot is left empty. */
7800static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007801 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007802 (binaryfunc)unicode_subscript, /* mp_subscript */
7803 (objobjargproc)0, /* mp_ass_subscript */
7804};
7805
Martin v. Löwis18e16552006-02-15 17:27:45 +00007806static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007808 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809 const void **ptr)
7810{
7811 if (index != 0) {
7812 PyErr_SetString(PyExc_SystemError,
7813 "accessing non-existent unicode segment");
7814 return -1;
7815 }
7816 *ptr = (void *) self->str;
7817 return PyUnicode_GET_DATA_SIZE(self);
7818}
7819
Martin v. Löwis18e16552006-02-15 17:27:45 +00007820static Py_ssize_t
7821unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 const void **ptr)
7823{
7824 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007825 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 return -1;
7827}
7828
7829static int
7830unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007831 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832{
7833 if (lenp)
7834 *lenp = PyUnicode_GET_DATA_SIZE(self);
7835 return 1;
7836}
7837
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007838static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007840 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 const void **ptr)
7842{
7843 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007844
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845 if (index != 0) {
7846 PyErr_SetString(PyExc_SystemError,
7847 "accessing non-existent unicode segment");
7848 return -1;
7849 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007850 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 if (str == NULL)
7852 return -1;
7853 *ptr = (void *) PyString_AS_STRING(str);
7854 return PyString_GET_SIZE(str);
7855}
7856
7857/* Helpers for PyUnicode_Format() */
7858
7859static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007860getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007862 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863 if (argidx < arglen) {
7864 (*p_argidx)++;
7865 if (arglen < 0)
7866 return args;
7867 else
7868 return PyTuple_GetItem(args, argidx);
7869 }
7870 PyErr_SetString(PyExc_TypeError,
7871 "not enough arguments for format string");
7872 return NULL;
7873}
7874
/* Bit flags collected by PyUnicode_Format() while parsing a conversion
   specifier; one bit per flag character:
     F_LJUST  '-'  (also set when a '*' width is negative)
     F_SIGN   '+'
     F_BLANK  ' '
     F_ALT    '#'
     F_ZERO   '0'  */
7875#define F_LJUST (1<<0)
7876#define F_SIGN (1<<1)
7877#define F_BLANK (1<<2)
7878#define F_ALT (1<<3)
7879#define F_ZERO (1<<4)
7880
Martin v. Löwis18e16552006-02-15 17:27:45 +00007881static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007882strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007884 register Py_ssize_t i;
7885 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 for (i = len - 1; i >= 0; i--)
7887 buffer[i] = (Py_UNICODE) charbuffer[i];
7888
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889 return len;
7890}
7891
Neal Norwitzfc76d632006-01-10 06:03:13 +00007892static int
7893doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7894{
Tim Peters15231542006-02-16 01:08:01 +00007895 Py_ssize_t result;
7896
Neal Norwitzfc76d632006-01-10 06:03:13 +00007897 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007898 result = strtounicode(buffer, (char *)buffer);
7899 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007900}
7901
7902static int
7903longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7904{
Tim Peters15231542006-02-16 01:08:01 +00007905 Py_ssize_t result;
7906
Neal Norwitzfc76d632006-01-10 06:03:13 +00007907 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007908 result = strtounicode(buffer, (char *)buffer);
7909 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007910}
7911
Guido van Rossum078151d2002-08-11 04:24:12 +00007912/* XXX To save some code duplication, formatfloat/long/int could have been
7913 shared with stringobject.c, converting from 8-bit to Unicode after the
7914 formatting is done. */
7915
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916static int
7917formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007918 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 int flags,
7920 int prec,
7921 int type,
7922 PyObject *v)
7923{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007924 /* fmt = '%#.' + `prec` + `type`
7925 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 char fmt[20];
7927 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007928
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929 x = PyFloat_AsDouble(v);
7930 if (x == -1.0 && PyErr_Occurred())
7931 return -1;
7932 if (prec < 0)
7933 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007934 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7935 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007936 /* Worst case length calc to ensure no buffer overrun:
7937
7938 'g' formats:
7939 fmt = %#.<prec>g
7940 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7941 for any double rep.)
7942 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7943
7944 'f' formats:
7945 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7946 len = 1 + 50 + 1 + prec = 52 + prec
7947
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007948 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007949 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007950
7951 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00007952 if (((type == 'g' || type == 'G') &&
7953 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007954 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007955 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007956 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007957 return -1;
7958 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007959 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7960 (flags&F_ALT) ? "#" : "",
7961 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007962 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963}
7964
Tim Peters38fd5b62000-09-21 05:43:11 +00007965static PyObject*
7966formatlong(PyObject *val, int flags, int prec, int type)
7967{
7968 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007969 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00007970 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00007971 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007972
7973 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7974 if (!str)
7975 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00007976 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00007977 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00007978 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00007979}
7980
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981static int
7982formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007983 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 int flags,
7985 int prec,
7986 int type,
7987 PyObject *v)
7988{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007989 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007990 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7991 * + 1 + 1
7992 * = 24
7993 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007994 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007995 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 long x;
7997
7998 x = PyInt_AsLong(v);
7999 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008000 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008001 if (x < 0 && type == 'u') {
8002 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008003 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008004 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8005 sign = "-";
8006 else
8007 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008009 prec = 1;
8010
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008011 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8012 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008013 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008014 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008015 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008016 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008017 return -1;
8018 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008019
8020 if ((flags & F_ALT) &&
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008021 (type == 'x' || type == 'X' || type == 'o')) {
8022 /* When converting under %#o, %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008023 * of issues that cause pain:
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008024 * - for %#o, we want a different base marker than C
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008025 * - when 0 is being converted, the C standard leaves off
8026 * the '0x' or '0X', which is inconsistent with other
8027 * %#x/%#X conversions and inconsistent with Python's
8028 * hex() function
8029 * - there are platforms that violate the standard and
8030 * convert 0 with the '0x' or '0X'
8031 * (Metrowerks, Compaq Tru64)
8032 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008033 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008034 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008035 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008036 * We can achieve the desired consistency by inserting our
8037 * own '0x' or '0X' prefix, and substituting %x/%X in place
8038 * of %#x/%#X.
8039 *
8040 * Note that this is the same approach as used in
8041 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008042 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008043 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8044 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008045 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008046 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008047 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8048 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008049 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008050 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008051 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008052 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008053 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008054 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055}
8056
8057static int
8058formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008059 size_t buflen,
8060 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008062 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008063 if (PyUnicode_Check(v)) {
8064 if (PyUnicode_GET_SIZE(v) != 1)
8065 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008069 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00008070 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008071 goto onError;
8072 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074
8075 else {
8076 /* Integer input truncated to a character */
8077 long x;
8078 x = PyInt_AsLong(v);
8079 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008080 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008081#ifdef Py_UNICODE_WIDE
8082 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008083 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008084 "%c arg not in range(0x110000) "
8085 "(wide Python build)");
8086 return -1;
8087 }
8088#else
8089 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008090 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008091 "%c arg not in range(0x10000) "
8092 "(narrow Python build)");
8093 return -1;
8094 }
8095#endif
8096 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 }
8098 buf[1] = '\0';
8099 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008100
8101 onError:
8102 PyErr_SetString(PyExc_TypeError,
8103 "%c requires int or char");
8104 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105}
8106
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008107/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8108
8109 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8110 chars are formatted. XXX This is a magic number. Each formatting
8111 routine does bounds checking to ensure no overflow, but a better
8112 solution may be to malloc a buffer of appropriate size for each
8113 format. For now, the current solution is sufficient.
8114*/
/* Counted in Py_UNICODE elements: PyUnicode_Format() declares its
   scratch array as Py_UNICODE formatbuf[FORMATBUFLEN]. */
8115#define FORMATBUFLEN (size_t)120
8116
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117PyObject *PyUnicode_Format(PyObject *format,
8118 PyObject *args)
8119{
8120 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008121 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 int args_owned = 0;
8123 PyUnicodeObject *result = NULL;
8124 PyObject *dict = NULL;
8125 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008126
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 if (format == NULL || args == NULL) {
8128 PyErr_BadInternalCall();
8129 return NULL;
8130 }
8131 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008132 if (uformat == NULL)
8133 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 fmt = PyUnicode_AS_UNICODE(uformat);
8135 fmtcnt = PyUnicode_GET_SIZE(uformat);
8136
8137 reslen = rescnt = fmtcnt + 100;
8138 result = _PyUnicode_New(reslen);
8139 if (result == NULL)
8140 goto onError;
8141 res = PyUnicode_AS_UNICODE(result);
8142
8143 if (PyTuple_Check(args)) {
8144 arglen = PyTuple_Size(args);
8145 argidx = 0;
8146 }
8147 else {
8148 arglen = -1;
8149 argidx = -2;
8150 }
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00008151 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008152 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 dict = args;
8154
8155 while (--fmtcnt >= 0) {
8156 if (*fmt != '%') {
8157 if (--rescnt < 0) {
8158 rescnt = fmtcnt + 100;
8159 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008160 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008161 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8163 --rescnt;
8164 }
8165 *res++ = *fmt++;
8166 }
8167 else {
8168 /* Got a format specifier */
8169 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008170 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 Py_UNICODE c = '\0';
8173 Py_UNICODE fill;
8174 PyObject *v = NULL;
8175 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008176 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008178 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008179 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180
8181 fmt++;
8182 if (*fmt == '(') {
8183 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008184 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 PyObject *key;
8186 int pcount = 1;
8187
8188 if (dict == NULL) {
8189 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00008190 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191 goto onError;
8192 }
8193 ++fmt;
8194 --fmtcnt;
8195 keystart = fmt;
8196 /* Skip over balanced parentheses */
8197 while (pcount > 0 && --fmtcnt >= 0) {
8198 if (*fmt == ')')
8199 --pcount;
8200 else if (*fmt == '(')
8201 ++pcount;
8202 fmt++;
8203 }
8204 keylen = fmt - keystart - 1;
8205 if (fmtcnt < 0 || pcount > 0) {
8206 PyErr_SetString(PyExc_ValueError,
8207 "incomplete format key");
8208 goto onError;
8209 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008210#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00008211 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 then looked up since Python uses strings to hold
8213 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00008214 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 key = PyUnicode_EncodeUTF8(keystart,
8216 keylen,
8217 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00008218#else
8219 key = PyUnicode_FromUnicode(keystart, keylen);
8220#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 if (key == NULL)
8222 goto onError;
8223 if (args_owned) {
8224 Py_DECREF(args);
8225 args_owned = 0;
8226 }
8227 args = PyObject_GetItem(dict, key);
8228 Py_DECREF(key);
8229 if (args == NULL) {
8230 goto onError;
8231 }
8232 args_owned = 1;
8233 arglen = -1;
8234 argidx = -2;
8235 }
8236 while (--fmtcnt >= 0) {
8237 switch (c = *fmt++) {
8238 case '-': flags |= F_LJUST; continue;
8239 case '+': flags |= F_SIGN; continue;
8240 case ' ': flags |= F_BLANK; continue;
8241 case '#': flags |= F_ALT; continue;
8242 case '0': flags |= F_ZERO; continue;
8243 }
8244 break;
8245 }
8246 if (c == '*') {
8247 v = getnextarg(args, arglen, &argidx);
8248 if (v == NULL)
8249 goto onError;
8250 if (!PyInt_Check(v)) {
8251 PyErr_SetString(PyExc_TypeError,
8252 "* wants int");
8253 goto onError;
8254 }
8255 width = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008256 if (width == -1 && PyErr_Occurred())
8257 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 if (width < 0) {
8259 flags |= F_LJUST;
8260 width = -width;
8261 }
8262 if (--fmtcnt >= 0)
8263 c = *fmt++;
8264 }
8265 else if (c >= '0' && c <= '9') {
8266 width = c - '0';
8267 while (--fmtcnt >= 0) {
8268 c = *fmt++;
8269 if (c < '0' || c > '9')
8270 break;
8271 if ((width*10) / 10 != width) {
8272 PyErr_SetString(PyExc_ValueError,
8273 "width too big");
8274 goto onError;
8275 }
8276 width = width*10 + (c - '0');
8277 }
8278 }
8279 if (c == '.') {
8280 prec = 0;
8281 if (--fmtcnt >= 0)
8282 c = *fmt++;
8283 if (c == '*') {
8284 v = getnextarg(args, arglen, &argidx);
8285 if (v == NULL)
8286 goto onError;
8287 if (!PyInt_Check(v)) {
8288 PyErr_SetString(PyExc_TypeError,
8289 "* wants int");
8290 goto onError;
8291 }
8292 prec = PyInt_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00008293 if (prec == -1 && PyErr_Occurred())
8294 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 if (prec < 0)
8296 prec = 0;
8297 if (--fmtcnt >= 0)
8298 c = *fmt++;
8299 }
8300 else if (c >= '0' && c <= '9') {
8301 prec = c - '0';
8302 while (--fmtcnt >= 0) {
8303 c = Py_CHARMASK(*fmt++);
8304 if (c < '0' || c > '9')
8305 break;
8306 if ((prec*10) / 10 != prec) {
8307 PyErr_SetString(PyExc_ValueError,
8308 "prec too big");
8309 goto onError;
8310 }
8311 prec = prec*10 + (c - '0');
8312 }
8313 }
8314 } /* prec */
8315 if (fmtcnt >= 0) {
8316 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 if (--fmtcnt >= 0)
8318 c = *fmt++;
8319 }
8320 }
8321 if (fmtcnt < 0) {
8322 PyErr_SetString(PyExc_ValueError,
8323 "incomplete format");
8324 goto onError;
8325 }
8326 if (c != '%') {
8327 v = getnextarg(args, arglen, &argidx);
8328 if (v == NULL)
8329 goto onError;
8330 }
8331 sign = 0;
8332 fill = ' ';
8333 switch (c) {
8334
8335 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008336 pbuf = formatbuf;
8337 /* presume that buffer length is at least 1 */
8338 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339 len = 1;
8340 break;
8341
8342 case 's':
8343 case 'r':
8344 if (PyUnicode_Check(v) && c == 's') {
8345 temp = v;
8346 Py_INCREF(temp);
8347 }
8348 else {
8349 PyObject *unicode;
8350 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008351 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 else
8353 temp = PyObject_Repr(v);
8354 if (temp == NULL)
8355 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008356 if (PyUnicode_Check(temp))
8357 /* nothing to do */;
8358 else if (PyString_Check(temp)) {
8359 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00008360 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00008362 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008364 Py_DECREF(temp);
8365 temp = unicode;
8366 if (temp == NULL)
8367 goto onError;
8368 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008369 else {
8370 Py_DECREF(temp);
8371 PyErr_SetString(PyExc_TypeError,
8372 "%s argument has non-string str()");
8373 goto onError;
8374 }
8375 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008376 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 len = PyUnicode_GET_SIZE(temp);
8378 if (prec >= 0 && len > prec)
8379 len = prec;
8380 break;
8381
8382 case 'i':
8383 case 'd':
8384 case 'u':
8385 case 'o':
8386 case 'x':
8387 case 'X':
8388 if (c == 'i')
8389 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008390 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008391 temp = formatlong(v, flags, prec, c);
8392 if (!temp)
8393 goto onError;
8394 pbuf = PyUnicode_AS_UNICODE(temp);
8395 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008396 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008398 else {
8399 pbuf = formatbuf;
8400 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8401 flags, prec, c, v);
8402 if (len < 0)
8403 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008404 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008405 }
8406 if (flags & F_ZERO)
8407 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408 break;
8409
8410 case 'e':
8411 case 'E':
8412 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008413 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414 case 'g':
8415 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008416 if (c == 'F')
8417 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008418 pbuf = formatbuf;
8419 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8420 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421 if (len < 0)
8422 goto onError;
8423 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008424 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 fill = '0';
8426 break;
8427
8428 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008429 pbuf = formatbuf;
8430 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 if (len < 0)
8432 goto onError;
8433 break;
8434
8435 default:
8436 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008437 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00008438 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008439 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008440 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008441 (Py_ssize_t)(fmt - 1 -
8442 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 goto onError;
8444 }
8445 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008446 if (*pbuf == '-' || *pbuf == '+') {
8447 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 len--;
8449 }
8450 else if (flags & F_SIGN)
8451 sign = '+';
8452 else if (flags & F_BLANK)
8453 sign = ' ';
8454 else
8455 sign = 0;
8456 }
8457 if (width < len)
8458 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008459 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460 reslen -= rescnt;
8461 rescnt = width + fmtcnt + 100;
8462 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008463 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008464 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008465 PyErr_NoMemory();
8466 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008467 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008468 if (_PyUnicode_Resize(&result, reslen) < 0) {
8469 Py_XDECREF(temp);
8470 goto onError;
8471 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472 res = PyUnicode_AS_UNICODE(result)
8473 + reslen - rescnt;
8474 }
8475 if (sign) {
8476 if (fill != ' ')
8477 *res++ = sign;
8478 rescnt--;
8479 if (width > len)
8480 width--;
8481 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008482 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008483 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008484 assert(pbuf[1] == c);
8485 if (fill != ' ') {
8486 *res++ = *pbuf++;
8487 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008488 }
Tim Petersfff53252001-04-12 18:38:48 +00008489 rescnt -= 2;
8490 width -= 2;
8491 if (width < 0)
8492 width = 0;
8493 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 if (width > len && !(flags & F_LJUST)) {
8496 do {
8497 --rescnt;
8498 *res++ = fill;
8499 } while (--width > len);
8500 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008501 if (fill == ' ') {
8502 if (sign)
8503 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00008504 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008505 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008506 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008507 *res++ = *pbuf++;
8508 *res++ = *pbuf++;
8509 }
8510 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008511 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 res += len;
8513 rescnt -= len;
8514 while (--width >= len) {
8515 --rescnt;
8516 *res++ = ' ';
8517 }
8518 if (dict && (argidx < arglen) && c != '%') {
8519 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008520 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008521 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 goto onError;
8523 }
8524 Py_XDECREF(temp);
8525 } /* '%' */
8526 } /* until end */
8527 if (argidx < arglen && !dict) {
8528 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008529 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 goto onError;
8531 }
8532
Thomas Woutersa96affe2006-03-12 00:29:36 +00008533 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8534 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535 if (args_owned) {
8536 Py_DECREF(args);
8537 }
8538 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 return (PyObject *)result;
8540
8541 onError:
8542 Py_XDECREF(result);
8543 Py_DECREF(uformat);
8544 if (args_owned) {
8545 Py_DECREF(args);
8546 }
8547 return NULL;
8548}
8549
/* Buffer-interface slot table for the unicode type.  It is referenced
   from the tp_as_buffer slot of PyUnicode_Type.  The four entries are
   the read-buffer, write-buffer, segment-count and char-buffer
   callbacks, in that order. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8556
Jeremy Hylton938ace62002-07-17 16:30:39 +00008557static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008558unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8559
Tim Peters6d6c1a32001-08-02 04:15:00 +00008560static PyObject *
8561unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8562{
8563 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00008564 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008565 char *encoding = NULL;
8566 char *errors = NULL;
8567
Guido van Rossume023fe02001-08-30 03:12:59 +00008568 if (type != &PyUnicode_Type)
8569 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008570 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8571 kwlist, &x, &encoding, &errors))
8572 return NULL;
8573 if (x == NULL)
8574 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008575 if (encoding == NULL && errors == NULL)
8576 return PyObject_Unicode(x);
8577 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008578 return PyUnicode_FromEncodedObject(x, encoding, errors);
8579}
8580
Guido van Rossume023fe02001-08-30 03:12:59 +00008581static PyObject *
8582unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8583{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008584 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008585 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008586
8587 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8588 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8589 if (tmp == NULL)
8590 return NULL;
8591 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008592 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008593 if (pnew == NULL) {
8594 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008595 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008596 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008597 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8598 if (pnew->str == NULL) {
8599 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008600 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008601 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008602 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008603 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008604 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8605 pnew->length = n;
8606 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008607 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008608 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008609}
8610
/* Docstring of the type object; used as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(string [, encoding[, errors]]) -> object\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008617
/* Forward declaration: the iterator factory is defined after the type
   object that references it in its tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for unicode strings, exposed under the name "str".
   It is subclassable (Py_TPFLAGS_BASETYPE) and derives from
   PyBaseString_Type. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8663
8664/* Initialize the Unicode implementation */
8665
Thomas Wouters78890102000-07-22 19:25:51 +00008666void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008668 int i;
8669
Thomas Wouters477c8d52006-05-27 19:21:47 +00008670 /* XXX - move this array to unicodectype.c ? */
8671 Py_UNICODE linebreak[] = {
8672 0x000A, /* LINE FEED */
8673 0x000D, /* CARRIAGE RETURN */
8674 0x001C, /* FILE SEPARATOR */
8675 0x001D, /* GROUP SEPARATOR */
8676 0x001E, /* RECORD SEPARATOR */
8677 0x0085, /* NEXT LINE */
8678 0x2028, /* LINE SEPARATOR */
8679 0x2029, /* PARAGRAPH SEPARATOR */
8680 };
8681
Fred Drakee4315f52000-05-09 19:53:39 +00008682 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008683 unicode_freelist = NULL;
8684 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008686 if (!unicode_empty)
8687 return;
8688
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008689 for (i = 0; i < 256; i++)
8690 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008691 if (PyType_Ready(&PyUnicode_Type) < 0)
8692 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008693
8694 /* initialize the linebreak bloom filter */
8695 bloom_linebreak = make_bloom_mask(
8696 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8697 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008698
8699 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700}
8701
8702/* Finalize the Unicode implementation */
8703
8704void
Thomas Wouters78890102000-07-22 19:25:51 +00008705_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008707 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008708 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008710 Py_XDECREF(unicode_empty);
8711 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008712
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008713 for (i = 0; i < 256; i++) {
8714 if (unicode_latin1[i]) {
8715 Py_DECREF(unicode_latin1[i]);
8716 unicode_latin1[i] = NULL;
8717 }
8718 }
8719
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008720 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 PyUnicodeObject *v = u;
8722 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008723 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008724 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008725 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008726 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008728 unicode_freelist = NULL;
8729 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008731
/* Intern the unicode object referenced by *p, in place.

   - Aborts via Py_FatalError() if *p is NULL or not a unicode object.
   - Does nothing for instances of subclasses and for objects that are
     already interned.
   - Creates the `interned` dictionary on first use.
   - If an equal string is already in the dictionary, *p is replaced by
     a new reference to that string and the old reference is released.
   - Otherwise the string is inserted as both key and value, its
     reference count is lowered by two to hide the dictionary's
     references, and it is marked SSTATE_INTERNED_MORTAL.

   Errors raised while creating or updating the dictionary are cleared;
   in that case *p is left unchanged and not interned. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one back.
           Take the new reference before dropping the old one. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The recursion_critical flag is raised only for the duration of
       the insertion and is reset on both the failure and success
       paths. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_Refcnt(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
8779
8780void
8781PyUnicode_InternImmortal(PyObject **p)
8782{
8783 PyUnicode_InternInPlace(p);
8784 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8785 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8786 Py_INCREF(*p);
8787 }
8788}
8789
8790PyObject *
8791PyUnicode_InternFromString(const char *cp)
8792{
8793 PyObject *s = PyUnicode_FromString(cp);
8794 if (s == NULL)
8795 return NULL;
8796 PyUnicode_InternInPlace(&s);
8797 return s;
8798}
8799
8800void _Py_ReleaseInternedUnicodeStrings(void)
8801{
8802 PyObject *keys;
8803 PyUnicodeObject *s;
8804 Py_ssize_t i, n;
8805 Py_ssize_t immortal_size = 0, mortal_size = 0;
8806
8807 if (interned == NULL || !PyDict_Check(interned))
8808 return;
8809 keys = PyDict_Keys(interned);
8810 if (keys == NULL || !PyList_Check(keys)) {
8811 PyErr_Clear();
8812 return;
8813 }
8814
8815 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8816 detector, interned unicode strings are not forcibly deallocated;
8817 rather, we give them their stolen references back, and then clear
8818 and DECREF the interned dict. */
8819
8820 n = PyList_GET_SIZE(keys);
8821 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8822 n);
8823 for (i = 0; i < n; i++) {
8824 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8825 switch (s->state) {
8826 case SSTATE_NOT_INTERNED:
8827 /* XXX Shouldn't happen */
8828 break;
8829 case SSTATE_INTERNED_IMMORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008830 Py_Refcnt(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00008831 immortal_size += s->length;
8832 break;
8833 case SSTATE_INTERNED_MORTAL:
Martin v. Löwis5d7428b2007-07-21 18:47:48 +00008834 Py_Refcnt(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00008835 mortal_size += s->length;
8836 break;
8837 default:
8838 Py_FatalError("Inconsistent interned string state.");
8839 }
8840 s->state = SSTATE_NOT_INTERNED;
8841 }
8842 fprintf(stderr, "total size of all interned strings: "
8843 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8844 "mortal/immortal\n", mortal_size, immortal_size);
8845 Py_DECREF(keys);
8846 PyDict_Clear(interned);
8847 Py_DECREF(interned);
8848 interned = NULL;
8849}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008850
8851
8852/********************* Unicode Iterator **************************/
8853
/* State of an iterator over a unicode string. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* Index of the next character to yield */
    PyUnicodeObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
8859
8860static void
8861unicodeiter_dealloc(unicodeiterobject *it)
8862{
8863 _PyObject_GC_UNTRACK(it);
8864 Py_XDECREF(it->it_seq);
8865 PyObject_GC_Del(it);
8866}
8867
/* GC traversal hook for unicode iterators: the only object reference
   held by the iterator is it_seq.  Py_VISIT relies on the parameters
   being named `visit` and `arg`. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
8874
8875static PyObject *
8876unicodeiter_next(unicodeiterobject *it)
8877{
8878 PyUnicodeObject *seq;
8879 PyObject *item;
8880
8881 assert(it != NULL);
8882 seq = it->it_seq;
8883 if (seq == NULL)
8884 return NULL;
8885 assert(PyUnicode_Check(seq));
8886
8887 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00008888 item = PyUnicode_FromUnicode(
8889 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00008890 if (item != NULL)
8891 ++it->it_index;
8892 return item;
8893 }
8894
8895 Py_DECREF(seq);
8896 it->it_seq = NULL;
8897 return NULL;
8898}
8899
8900static PyObject *
8901unicodeiter_len(unicodeiterobject *it)
8902{
8903 Py_ssize_t len = 0;
8904 if (it->it_seq)
8905 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8906 return PyInt_FromSsize_t(len);
8907}
8908
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the unicode iterator type; the only method is
   __length_hint__, implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
8916
/* Type object of the iterator returned by unicode_iter().  Instances
   take part in garbage collection (Py_TPFLAGS_HAVE_GC) and are their
   own iterator (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicodeiterator",                  /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
8949
8950static PyObject *
8951unicode_iter(PyObject *seq)
8952{
8953 unicodeiterobject *it;
8954
8955 if (!PyUnicode_Check(seq)) {
8956 PyErr_BadInternalCall();
8957 return NULL;
8958 }
8959 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8960 if (it == NULL)
8961 return NULL;
8962 it->it_index = 0;
8963 Py_INCREF(seq);
8964 it->it_seq = (PyUnicodeObject *)seq;
8965 _PyObject_GC_TRACK(it);
8966 return (PyObject *)it;
8967}
8968
Martin v. Löwis5b222132007-06-10 09:51:05 +00008969size_t
8970Py_UNICODE_strlen(const Py_UNICODE *u)
8971{
8972 int res = 0;
8973 while(*u++)
8974 res++;
8975 return res;
8976}
8977
8978Py_UNICODE*
8979Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8980{
8981 Py_UNICODE *u = s1;
8982 while ((*u++ = *s2++));
8983 return s1;
8984}
8985
8986Py_UNICODE*
8987Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8988{
8989 Py_UNICODE *u = s1;
8990 while ((*u++ = *s2++))
8991 if (n-- == 0)
8992 break;
8993 return s1;
8994}
8995
8996int
8997Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8998{
8999 while (*s1 && *s2 && *s1 == *s2)
9000 s1++, s2++;
9001 if (*s1 && *s2)
9002 return (*s1 < *s2) ? -1 : +1;
9003 if (*s1)
9004 return 1;
9005 if (*s2)
9006 return -1;
9007 return 0;
9008}
9009
9010Py_UNICODE*
9011Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9012{
9013 const Py_UNICODE *p;
9014 for (p = s; *p; p++)
9015 if (*p == c)
9016 return (Py_UNICODE*)p;
9017 return NULL;
9018}
9019
9020
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009021#ifdef __cplusplus
9022}
9023#endif
9024
9025
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009026/*
9027Local variables:
9028c-basic-offset: 4
9029indent-tabs-mode: nil
9030End:
9031*/